diff --git a/langfuse/api/__init__.py b/langfuse/api/__init__.py
index 0e036263a..46985c0b9 100644
--- a/langfuse/api/__init__.py
+++ b/langfuse/api/__init__.py
@@ -30,6 +30,7 @@
         scim,
         score_configs,
         scores,
+        scores_v3,
         sessions,
         trace,
         unstable,
@@ -297,6 +298,31 @@
         GetScoresResponseData_Text,
         GetScoresResponseTraceData,
     )
+    from .scores_v3 import (
+        BaseScoreV3,
+        BooleanScoreV3,
+        CategoricalScoreV3,
+        CorrectionScoreV3,
+        GetScoresV3Meta,
+        GetScoresV3Response,
+        NumericScoreV3,
+        ScoreSubjectExperimentV3,
+        ScoreSubjectObservationV3,
+        ScoreSubjectSessionV3,
+        ScoreSubjectTraceV3,
+        ScoreSubjectV3,
+        ScoreSubjectV3_Experiment,
+        ScoreSubjectV3_Observation,
+        ScoreSubjectV3_Session,
+        ScoreSubjectV3_Trace,
+        ScoreV3,
+        ScoreV3_Boolean,
+        ScoreV3_Categorical,
+        ScoreV3_Correction,
+        ScoreV3_Numeric,
+        ScoreV3_Text,
+        TextScoreV3,
+    )
     from .sessions import PaginatedSessions
     from .trace import DeleteTraceResponse, Sort, Traces
 _dynamic_imports: typing.Dict[str, str] = {
@@ -316,6 +342,7 @@
     "BasePrompt": ".prompts",
     "BaseScore": ".commons",
     "BaseScoreV1": ".commons",
+    "BaseScoreV3": ".scores_v3",
     "BlobStorageExportFieldGroup": ".blob_storage_integrations",
     "BlobStorageExportFrequency": ".blob_storage_integrations",
     "BlobStorageExportMode": ".blob_storage_integrations",
@@ -329,9 +356,11 @@
     "BlobStorageSyncStatus": ".blob_storage_integrations",
     "BooleanScore": ".commons",
     "BooleanScoreV1": ".commons",
+    "BooleanScoreV3": ".scores_v3",
     "BulkConfig": ".scim",
     "CategoricalScore": ".commons",
     "CategoricalScoreV1": ".commons",
+    "CategoricalScoreV3": ".scores_v3",
     "ChatMessage": ".prompts",
     "ChatMessageType": ".prompts",
     "ChatMessageWithPlaceholders": ".prompts",
@@ -340,6 +369,7 @@
     "CommentObjectType": ".commons",
     "ConfigCategory": ".commons",
     "CorrectionScore": ".commons",
+    "CorrectionScoreV3": ".scores_v3",
     "CreateAnnotationQueueAssignmentResponse": ".annotation_queues",
     "CreateAnnotationQueueItemRequest": ".annotation_queues",
     "CreateAnnotationQueueRequest": ".annotation_queues",
@@ -397,6 +427,8 @@
     "GetScoresResponseData_Numeric": ".scores",
     "GetScoresResponseData_Text": ".scores",
     "GetScoresResponseTraceData": ".scores",
+    "GetScoresV3Meta": ".scores_v3",
+    "GetScoresV3Response": ".scores_v3",
     "HealthResponse": ".health",
     "IngestionError": ".ingestion",
     "IngestionEvent": ".ingestion",
@@ -431,6 +463,7 @@
     "NotFoundError": ".commons",
     "NumericScore": ".commons",
     "NumericScoreV1": ".commons",
+    "NumericScoreV3": ".scores_v3",
     "Observation": ".commons",
     "ObservationBody": ".ingestion",
     "ObservationLevel": ".commons",
@@ -500,11 +533,26 @@
     "ScoreDataType": ".commons",
     "ScoreEvent": ".ingestion",
     "ScoreSource": ".commons",
+    "ScoreSubjectExperimentV3": ".scores_v3",
+    "ScoreSubjectObservationV3": ".scores_v3",
+    "ScoreSubjectSessionV3": ".scores_v3",
+    "ScoreSubjectTraceV3": ".scores_v3",
+    "ScoreSubjectV3": ".scores_v3",
+    "ScoreSubjectV3_Experiment": ".scores_v3",
+    "ScoreSubjectV3_Observation": ".scores_v3",
+    "ScoreSubjectV3_Session": ".scores_v3",
+    "ScoreSubjectV3_Trace": ".scores_v3",
     "ScoreV1": ".commons",
     "ScoreV1_Boolean": ".commons",
     "ScoreV1_Categorical": ".commons",
     "ScoreV1_Numeric": ".commons",
     "ScoreV1_Text": ".commons",
+    "ScoreV3": ".scores_v3",
+    "ScoreV3_Boolean": ".scores_v3",
+    "ScoreV3_Categorical": ".scores_v3",
+    "ScoreV3_Correction": ".scores_v3",
+    "ScoreV3_Numeric": ".scores_v3",
+    "ScoreV3_Text": ".scores_v3",
     "Score_Boolean": ".commons",
     "Score_Categorical": ".commons",
     "Score_Correction": ".commons",
@@ -520,6 +568,7 @@
     "TextPrompt": ".prompts",
     "TextScore": ".commons",
     "TextScoreV1": ".commons",
+    "TextScoreV3": ".scores_v3",
     "Trace": ".commons",
     "TraceBody": ".ingestion",
     "TraceEvent": ".ingestion",
@@ -562,6 +611,7 @@
     "scim": ".scim",
     "score_configs": ".score_configs",
     "scores": ".scores",
+    "scores_v3": ".scores_v3",
     "sessions": ".sessions",
     "trace": ".trace",
     "unstable": ".unstable",
@@ -613,6 +663,7 @@ def __dir__():
     "BasePrompt",
     "BaseScore",
     "BaseScoreV1",
+    "BaseScoreV3",
     "BlobStorageExportFieldGroup",
     "BlobStorageExportFrequency",
     "BlobStorageExportMode",
@@ -626,9 +677,11 @@ def __dir__():
     "BlobStorageSyncStatus",
     "BooleanScore",
     "BooleanScoreV1",
+    "BooleanScoreV3",
     "BulkConfig",
     "CategoricalScore",
     "CategoricalScoreV1",
+    "CategoricalScoreV3",
     "ChatMessage",
     "ChatMessageType",
     "ChatMessageWithPlaceholders",
@@ -637,6 +690,7 @@ def __dir__():
     "CommentObjectType",
     "ConfigCategory",
     "CorrectionScore",
+    "CorrectionScoreV3",
     "CreateAnnotationQueueAssignmentResponse",
     "CreateAnnotationQueueItemRequest",
     "CreateAnnotationQueueRequest",
@@ -694,6 +748,8 @@ def __dir__():
     "GetScoresResponseData_Numeric",
     "GetScoresResponseData_Text",
     "GetScoresResponseTraceData",
+    "GetScoresV3Meta",
+    "GetScoresV3Response",
     "HealthResponse",
     "IngestionError",
     "IngestionEvent",
@@ -728,6 +784,7 @@ def __dir__():
     "NotFoundError",
     "NumericScore",
     "NumericScoreV1",
+    "NumericScoreV3",
     "Observation",
     "ObservationBody",
     "ObservationLevel",
@@ -797,11 +854,26 @@ def __dir__():
     "ScoreDataType",
     "ScoreEvent",
     "ScoreSource",
+    "ScoreSubjectExperimentV3",
+    "ScoreSubjectObservationV3",
+    "ScoreSubjectSessionV3",
+    "ScoreSubjectTraceV3",
+    "ScoreSubjectV3",
+    "ScoreSubjectV3_Experiment",
+    "ScoreSubjectV3_Observation",
+    "ScoreSubjectV3_Session",
+    "ScoreSubjectV3_Trace",
     "ScoreV1",
     "ScoreV1_Boolean",
     "ScoreV1_Categorical",
     "ScoreV1_Numeric",
     "ScoreV1_Text",
+    "ScoreV3",
+    "ScoreV3_Boolean",
+    "ScoreV3_Categorical",
+    "ScoreV3_Correction",
+    "ScoreV3_Numeric",
+    "ScoreV3_Text",
     "Score_Boolean",
     "Score_Categorical",
     "Score_Correction",
@@ -817,6 +889,7 @@ def __dir__():
     "TextPrompt",
     "TextScore",
     "TextScoreV1",
+    "TextScoreV3",
     "Trace",
     "TraceBody",
     "TraceEvent",
@@ -859,6 +932,7 @@ def __dir__():
     "scim",
     "score_configs",
     "scores",
+    "scores_v3",
     "sessions",
     "trace",
     "unstable",
diff --git a/langfuse/api/client.py b/langfuse/api/client.py
index c0413704b..a72aede85 100644
--- a/langfuse/api/client.py
+++ b/langfuse/api/client.py
@@ -39,6 +39,7 @@
     from .scim.client import AsyncScimClient, ScimClient
     from .score_configs.client import AsyncScoreConfigsClient, ScoreConfigsClient
     from .scores.client import AsyncScoresClient, ScoresClient
+    from .scores_v3.client import AsyncScoresV3Client, ScoresV3Client
     from .sessions.client import AsyncSessionsClient, SessionsClient
     from .trace.client import AsyncTraceClient, TraceClient
     from .unstable.client import AsyncUnstableClient, UnstableClient
@@ -145,6 +146,7 @@ def __init__(
         self._prompts: typing.Optional[PromptsClient] = None
         self._scim: typing.Optional[ScimClient] = None
         self._score_configs: typing.Optional[ScoreConfigsClient] = None
+        self._scores_v3: typing.Optional[ScoresV3Client] = None
         self._scores: typing.Optional[ScoresClient] = None
         self._sessions: typing.Optional[SessionsClient] = None
         self._trace: typing.Optional[TraceClient] = None
@@ -336,6 +338,14 @@ def score_configs(self):
             )
         return self._score_configs
 
+    @property
+    def scores_v3(self):
+        if self._scores_v3 is None:  # first access: import and build the client
+            from .scores_v3.client import ScoresV3Client  # noqa: E402
+
+            self._scores_v3 = ScoresV3Client(client_wrapper=self._client_wrapper)
+        return self._scores_v3  # cached instance, reused by later accesses
+
     @property
     def scores(self):
         if self._scores is None:
@@ -470,6 +480,7 @@ def __init__(
         self._prompts: typing.Optional[AsyncPromptsClient] = None
         self._scim: typing.Optional[AsyncScimClient] = None
         self._score_configs: typing.Optional[AsyncScoreConfigsClient] = None
+        self._scores_v3: typing.Optional[AsyncScoresV3Client] = None
         self._scores: typing.Optional[AsyncScoresClient] = None
         self._sessions: typing.Optional[AsyncSessionsClient] = None
         self._trace: typing.Optional[AsyncTraceClient] = None
@@ -665,6 +676,14 @@ def score_configs(self):
             )
         return self._score_configs
 
+    @property
+    def scores_v3(self):
+        if self._scores_v3 is None:  # first access: import and build the client
+            from .scores_v3.client import AsyncScoresV3Client  # noqa: E402
+
+            self._scores_v3 = AsyncScoresV3Client(client_wrapper=self._client_wrapper)
+        return self._scores_v3  # cached instance, reused by later accesses
+
     @property
     def scores(self):
         if self._scores is None:
diff --git a/langfuse/api/scores_v3/__init__.py b/langfuse/api/scores_v3/__init__.py
new file mode 100644
index 000000000..855868335
--- /dev/null
+++ b/langfuse/api/scores_v3/__init__.py
@@ -0,0 +1,112 @@
+# This file was auto-generated by Fern from our API Definition.
+
+# isort: skip_file
+
+import typing
+from importlib import import_module
+
+if typing.TYPE_CHECKING:
+    from .types import (
+        BaseScoreV3,
+        BooleanScoreV3,
+        CategoricalScoreV3,
+        CorrectionScoreV3,
+        GetScoresV3Meta,
+        GetScoresV3Response,
+        NumericScoreV3,
+        ScoreSubjectExperimentV3,
+        ScoreSubjectObservationV3,
+        ScoreSubjectSessionV3,
+        ScoreSubjectTraceV3,
+        ScoreSubjectV3,
+        ScoreSubjectV3_Experiment,
+        ScoreSubjectV3_Observation,
+        ScoreSubjectV3_Session,
+        ScoreSubjectV3_Trace,
+        ScoreV3,
+        ScoreV3_Boolean,
+        ScoreV3_Categorical,
+        ScoreV3_Correction,
+        ScoreV3_Numeric,
+        ScoreV3_Text,
+        TextScoreV3,
+    )
+_dynamic_imports: typing.Dict[str, str] = {  # exported name -> relative module
+    "BaseScoreV3": ".types",
+    "BooleanScoreV3": ".types",
+    "CategoricalScoreV3": ".types",
+    "CorrectionScoreV3": ".types",
+    "GetScoresV3Meta": ".types",
+    "GetScoresV3Response": ".types",
+    "NumericScoreV3": ".types",
+    "ScoreSubjectExperimentV3": ".types",
+    "ScoreSubjectObservationV3": ".types",
+    "ScoreSubjectSessionV3": ".types",
+    "ScoreSubjectTraceV3": ".types",
+    "ScoreSubjectV3": ".types",
+    "ScoreSubjectV3_Experiment": ".types",
+    "ScoreSubjectV3_Observation": ".types",
+    "ScoreSubjectV3_Session": ".types",
+    "ScoreSubjectV3_Trace": ".types",
+    "ScoreV3": ".types",
+    "ScoreV3_Boolean": ".types",
+    "ScoreV3_Categorical": ".types",
+    "ScoreV3_Correction": ".types",
+    "ScoreV3_Numeric": ".types",
+    "ScoreV3_Text": ".types",
+    "TextScoreV3": ".types",
+}
+
+
+def __getattr__(attr_name: str) -> typing.Any:  # module-level lazy-import hook
+    module_name = _dynamic_imports.get(attr_name)  # None when not a lazy export
+    if module_name is None:
+        raise AttributeError(
+            f"No {attr_name} found in _dynamic_imports for module name -> {__name__}"
+        )
+    try:
+        module = import_module(module_name, __package__)  # relative to this package
+        if module_name == f".{attr_name}":  # the name is the submodule itself
+            return module
+        else:
+            return getattr(module, attr_name)
+    except ImportError as e:
+        raise ImportError(
+            f"Failed to import {attr_name} from {module_name}: {e}"
+        ) from e
+    except AttributeError as e:
+        raise AttributeError(
+            f"Failed to get {attr_name} from {module_name}: {e}"
+        ) from e
+
+
+def __dir__() -> typing.List[str]:  # sorted names of the lazy exports
+    lazy_attrs = list(_dynamic_imports.keys())
+    return sorted(lazy_attrs)
+
+
+__all__ = [  # same names as the keys of _dynamic_imports
+    "BaseScoreV3",
+    "BooleanScoreV3",
+    "CategoricalScoreV3",
+    "CorrectionScoreV3",
+    "GetScoresV3Meta",
+    "GetScoresV3Response",
+    "NumericScoreV3",
+    "ScoreSubjectExperimentV3",
+    "ScoreSubjectObservationV3",
+    "ScoreSubjectSessionV3",
+    "ScoreSubjectTraceV3",
+    "ScoreSubjectV3",
+    "ScoreSubjectV3_Experiment",
+    "ScoreSubjectV3_Observation",
+    "ScoreSubjectV3_Session",
+    "ScoreSubjectV3_Trace",
+    "ScoreV3",
+    "ScoreV3_Boolean",
+    "ScoreV3_Categorical",
+    "ScoreV3_Correction",
+    "ScoreV3_Numeric",
+    "ScoreV3_Text",
+    "TextScoreV3",
+]
diff --git a/langfuse/api/scores_v3/client.py b/langfuse/api/scores_v3/client.py
new file mode 100644
index 000000000..2755d3e74
--- /dev/null
+++ b/langfuse/api/scores_v3/client.py
@@ -0,0 +1,341 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
+from ..core.request_options import RequestOptions
+from .raw_client import AsyncRawScoresV3Client, RawScoresV3Client
+from .types.get_scores_v3response import GetScoresV3Response
+
+
+class ScoresV3Client:  # synchronous; delegates to RawScoresV3Client
+    def __init__(self, *, client_wrapper: SyncClientWrapper):
+        self._raw_client = RawScoresV3Client(client_wrapper=client_wrapper)
+
+    @property
+    def with_raw_response(self) -> RawScoresV3Client:
+        """
+        Retrieves a raw implementation of this client that returns raw responses.
+
+        Returns
+        -------
+        RawScoresV3Client
+        """
+        return self._raw_client
+
+    def get_many_v3(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        cursor: typing.Optional[str] = None,
+        fields: typing.Optional[str] = None,
+        id: typing.Optional[str] = None,
+        name: typing.Optional[str] = None,
+        source: typing.Optional[str] = None,
+        data_type: typing.Optional[str] = None,
+        environment: typing.Optional[str] = None,
+        config_id: typing.Optional[str] = None,
+        queue_id: typing.Optional[str] = None,
+        author_user_id: typing.Optional[str] = None,
+        value: typing.Optional[str] = None,
+        value_min: typing.Optional[float] = None,
+        value_max: typing.Optional[float] = None,
+        trace_id: typing.Optional[str] = None,
+        session_id: typing.Optional[str] = None,
+        observation_id: typing.Optional[str] = None,
+        experiment_id: typing.Optional[str] = None,
+        from_timestamp: typing.Optional[dt.datetime] = None,
+        to_timestamp: typing.Optional[dt.datetime] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> GetScoresV3Response:
+        """
+        Get a list of scores with a polymorphic `value` field (v3).
+
+        This endpoint requires Langfuse v4 or later.
+
+        The `value` field type depends on `dataType`:
+        - `NUMERIC` → number
+        - `BOOLEAN` → boolean
+        - `CATEGORICAL`, `TEXT`, `CORRECTION` → string
+
+        Use the `fields` parameter to include optional field groups beyond the
+        default `core`. Unknown group names return HTTP 400.
+
+        Parameters
+        ----------
+        limit : typing.Optional[int]
+            Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400.
+
+        cursor : typing.Optional[str]
+            URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page.
+
+        fields : typing.Optional[str]
+            Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400.
+
+        id : typing.Optional[str]
+            Comma-separated list of score IDs to filter by (OR within, AND across filters).
+
+        name : typing.Optional[str]
+            Comma-separated list of score names to filter by.
+
+        source : typing.Optional[str]
+            Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent.
+
+        data_type : typing.Optional[str]
+            Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax.
+
+        environment : typing.Optional[str]
+            Comma-separated list of environments to filter by.
+
+        config_id : typing.Optional[str]
+            Comma-separated list of score config IDs to filter by.
+
+        queue_id : typing.Optional[str]
+            Comma-separated list of annotation queue IDs to filter by.
+
+        author_user_id : typing.Optional[str]
+            Comma-separated list of author user IDs to filter by.
+
+        value : typing.Optional[str]
+            Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400.
+
+        value_min : typing.Optional[float]
+            Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        value_max : typing.Optional[float]
+            Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        trace_id : typing.Optional[str]
+            Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace.
+
+        session_id : typing.Optional[str]
+            Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId.
+
+        observation_id : typing.Optional[str]
+            Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId.
+
+        experiment_id : typing.Optional[str]
+            Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId.
+
+        from_timestamp : typing.Optional[dt.datetime]
+            Inclusive lower bound on the score timestamp.
+
+        to_timestamp : typing.Optional[dt.datetime]
+            Exclusive upper bound on the score timestamp.
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        GetScoresV3Response
+
+        Examples
+        --------
+        from langfuse import LangfuseAPI
+
+        client = LangfuseAPI(
+            x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
+            x_langfuse_sdk_version="YOUR_X_LANGFUSE_SDK_VERSION",
+            x_langfuse_public_key="YOUR_X_LANGFUSE_PUBLIC_KEY",
+            username="YOUR_USERNAME",
+            password="YOUR_PASSWORD",
+            base_url="https://yourhost.com/path/to/api",
+        )
+        client.scores_v3.get_many_v3()
+        """
+        _response = self._raw_client.get_many_v3(
+            limit=limit,
+            cursor=cursor,
+            fields=fields,
+            id=id,
+            name=name,
+            source=source,
+            data_type=data_type,
+            environment=environment,
+            config_id=config_id,
+            queue_id=queue_id,
+            author_user_id=author_user_id,
+            value=value,
+            value_min=value_min,
+            value_max=value_max,
+            trace_id=trace_id,
+            session_id=session_id,
+            observation_id=observation_id,
+            experiment_id=experiment_id,
+            from_timestamp=from_timestamp,
+            to_timestamp=to_timestamp,
+            request_options=request_options,
+        )
+        return _response.data  # the GetScoresV3Response carried by the raw response
+
+
+class AsyncScoresV3Client:  # asynchronous; delegates to AsyncRawScoresV3Client
+    def __init__(self, *, client_wrapper: AsyncClientWrapper):
+        self._raw_client = AsyncRawScoresV3Client(client_wrapper=client_wrapper)
+
+    @property
+    def with_raw_response(self) -> AsyncRawScoresV3Client:
+        """
+        Retrieves a raw implementation of this client that returns raw responses.
+
+        Returns
+        -------
+        AsyncRawScoresV3Client
+        """
+        return self._raw_client
+
+    async def get_many_v3(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        cursor: typing.Optional[str] = None,
+        fields: typing.Optional[str] = None,
+        id: typing.Optional[str] = None,
+        name: typing.Optional[str] = None,
+        source: typing.Optional[str] = None,
+        data_type: typing.Optional[str] = None,
+        environment: typing.Optional[str] = None,
+        config_id: typing.Optional[str] = None,
+        queue_id: typing.Optional[str] = None,
+        author_user_id: typing.Optional[str] = None,
+        value: typing.Optional[str] = None,
+        value_min: typing.Optional[float] = None,
+        value_max: typing.Optional[float] = None,
+        trace_id: typing.Optional[str] = None,
+        session_id: typing.Optional[str] = None,
+        observation_id: typing.Optional[str] = None,
+        experiment_id: typing.Optional[str] = None,
+        from_timestamp: typing.Optional[dt.datetime] = None,
+        to_timestamp: typing.Optional[dt.datetime] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> GetScoresV3Response:
+        """
+        Get a list of scores with a polymorphic `value` field (v3).
+
+        This endpoint requires Langfuse v4 or later.
+
+        The `value` field type depends on `dataType`:
+        - `NUMERIC` → number
+        - `BOOLEAN` → boolean
+        - `CATEGORICAL`, `TEXT`, `CORRECTION` → string
+
+        Use the `fields` parameter to include optional field groups beyond the
+        default `core`. Unknown group names return HTTP 400.
+
+        Parameters
+        ----------
+        limit : typing.Optional[int]
+            Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400.
+
+        cursor : typing.Optional[str]
+            URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page.
+
+        fields : typing.Optional[str]
+            Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400.
+
+        id : typing.Optional[str]
+            Comma-separated list of score IDs to filter by (OR within, AND across filters).
+
+        name : typing.Optional[str]
+            Comma-separated list of score names to filter by.
+
+        source : typing.Optional[str]
+            Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent.
+
+        data_type : typing.Optional[str]
+            Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax.
+
+        environment : typing.Optional[str]
+            Comma-separated list of environments to filter by.
+
+        config_id : typing.Optional[str]
+            Comma-separated list of score config IDs to filter by.
+
+        queue_id : typing.Optional[str]
+            Comma-separated list of annotation queue IDs to filter by.
+
+        author_user_id : typing.Optional[str]
+            Comma-separated list of author user IDs to filter by.
+
+        value : typing.Optional[str]
+            Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400.
+
+        value_min : typing.Optional[float]
+            Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        value_max : typing.Optional[float]
+            Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        trace_id : typing.Optional[str]
+            Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace.
+
+        session_id : typing.Optional[str]
+            Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId.
+
+        observation_id : typing.Optional[str]
+            Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId.
+
+        experiment_id : typing.Optional[str]
+            Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId.
+
+        from_timestamp : typing.Optional[dt.datetime]
+            Inclusive lower bound on the score timestamp.
+
+        to_timestamp : typing.Optional[dt.datetime]
+            Exclusive upper bound on the score timestamp.
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        GetScoresV3Response
+
+        Examples
+        --------
+        import asyncio
+
+        from langfuse import AsyncLangfuseAPI
+
+        client = AsyncLangfuseAPI(
+            x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
+            x_langfuse_sdk_version="YOUR_X_LANGFUSE_SDK_VERSION",
+            x_langfuse_public_key="YOUR_X_LANGFUSE_PUBLIC_KEY",
+            username="YOUR_USERNAME",
+            password="YOUR_PASSWORD",
+            base_url="https://yourhost.com/path/to/api",
+        )
+
+
+        async def main() -> None:
+            await client.scores_v3.get_many_v3()
+
+
+        asyncio.run(main())
+        """
+        _response = await self._raw_client.get_many_v3(
+            limit=limit,
+            cursor=cursor,
+            fields=fields,
+            id=id,
+            name=name,
+            source=source,
+            data_type=data_type,
+            environment=environment,
+            config_id=config_id,
+            queue_id=queue_id,
+            author_user_id=author_user_id,
+            value=value,
+            value_min=value_min,
+            value_max=value_max,
+            trace_id=trace_id,
+            session_id=session_id,
+            observation_id=observation_id,
+            experiment_id=experiment_id,
+            from_timestamp=from_timestamp,
+            to_timestamp=to_timestamp,
+            request_options=request_options,
+        )
+        return _response.data  # the GetScoresV3Response carried by the raw response
diff --git a/langfuse/api/scores_v3/raw_client.py b/langfuse/api/scores_v3/raw_client.py
new file mode 100644
index 000000000..47c9f3f8d
--- /dev/null
+++ b/langfuse/api/scores_v3/raw_client.py
@@ -0,0 +1,460 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+from json.decoder import JSONDecodeError
+
+from ..commons.errors.access_denied_error import AccessDeniedError
+from ..commons.errors.error import Error
+from ..commons.errors.method_not_allowed_error import MethodNotAllowedError
+from ..commons.errors.not_found_error import NotFoundError
+from ..commons.errors.unauthorized_error import UnauthorizedError
+from ..core.api_error import ApiError
+from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
+from ..core.datetime_utils import serialize_datetime
+from ..core.http_response import AsyncHttpResponse, HttpResponse
+from ..core.pydantic_utilities import parse_obj_as
+from ..core.request_options import RequestOptions
+from .types.get_scores_v3response import GetScoresV3Response
+
+
+class RawScoresV3Client:  # synchronous client; returns the parsed body together with the raw HTTP response
+    def __init__(self, *, client_wrapper: SyncClientWrapper):
+        self._client_wrapper = client_wrapper
+
+    def get_many_v3(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        cursor: typing.Optional[str] = None,
+        fields: typing.Optional[str] = None,
+        id: typing.Optional[str] = None,
+        name: typing.Optional[str] = None,
+        source: typing.Optional[str] = None,
+        data_type: typing.Optional[str] = None,
+        environment: typing.Optional[str] = None,
+        config_id: typing.Optional[str] = None,
+        queue_id: typing.Optional[str] = None,
+        author_user_id: typing.Optional[str] = None,
+        value: typing.Optional[str] = None,
+        value_min: typing.Optional[float] = None,
+        value_max: typing.Optional[float] = None,
+        trace_id: typing.Optional[str] = None,
+        session_id: typing.Optional[str] = None,
+        observation_id: typing.Optional[str] = None,
+        experiment_id: typing.Optional[str] = None,
+        from_timestamp: typing.Optional[dt.datetime] = None,
+        to_timestamp: typing.Optional[dt.datetime] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> HttpResponse[GetScoresV3Response]:
+        """
+        Get a list of scores with a polymorphic `value` field (v3).
+
+        This endpoint requires Langfuse v4 or later.
+
+        The `value` field type depends on `dataType`:
+        - `NUMERIC` → number
+        - `BOOLEAN` → boolean
+        - `CATEGORICAL`, `TEXT`, `CORRECTION` → string
+
+        Use the `fields` parameter to include optional field groups beyond the
+        default `core`. Unknown group names return HTTP 400.
+
+        Parameters
+        ----------
+        limit : typing.Optional[int]
+            Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400.
+
+        cursor : typing.Optional[str]
+            URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page.
+
+        fields : typing.Optional[str]
+            Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400.
+
+        id : typing.Optional[str]
+            Comma-separated list of score IDs to filter by (OR within, AND across filters).
+
+        name : typing.Optional[str]
+            Comma-separated list of score names to filter by.
+
+        source : typing.Optional[str]
+            Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent.
+
+        data_type : typing.Optional[str]
+            Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax.
+
+        environment : typing.Optional[str]
+            Comma-separated list of environments to filter by.
+
+        config_id : typing.Optional[str]
+            Comma-separated list of score config IDs to filter by.
+
+        queue_id : typing.Optional[str]
+            Comma-separated list of annotation queue IDs to filter by.
+
+        author_user_id : typing.Optional[str]
+            Comma-separated list of author user IDs to filter by.
+
+        value : typing.Optional[str]
+            Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400.
+
+        value_min : typing.Optional[float]
+            Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        value_max : typing.Optional[float]
+            Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        trace_id : typing.Optional[str]
+            Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace.
+
+        session_id : typing.Optional[str]
+            Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId.
+
+        observation_id : typing.Optional[str]
+            Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId.
+
+        experiment_id : typing.Optional[str]
+            Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId.
+
+        from_timestamp : typing.Optional[dt.datetime]
+            Inclusive lower bound on the score timestamp.
+
+        to_timestamp : typing.Optional[dt.datetime]
+            Exclusive upper bound on the score timestamp.
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        HttpResponse[GetScoresV3Response]
+        """
+        _response = self._client_wrapper.httpx_client.request(
+            "api/public/v3/scores",
+            method="GET",
+            params={  # query keys use the API's camelCase names, not the snake_case argument names
+                "limit": limit,
+                "cursor": cursor,
+                "fields": fields,
+                "id": id,
+                "name": name,
+                "source": source,
+                "dataType": data_type,
+                "environment": environment,
+                "configId": config_id,
+                "queueId": queue_id,
+                "authorUserId": author_user_id,
+                "value": value,
+                "valueMin": value_min,
+                "valueMax": value_max,
+                "traceId": trace_id,
+                "sessionId": session_id,
+                "observationId": observation_id,
+                "experimentId": experiment_id,
+                "fromTimestamp": serialize_datetime(from_timestamp)
+                if from_timestamp is not None
+                else None,
+                "toTimestamp": serialize_datetime(to_timestamp)
+                if to_timestamp is not None
+                else None,
+            },
+            request_options=request_options,
+        )
+        try:  # every _response.json() call below shares the JSONDecodeError handler at the end
+            if 200 <= _response.status_code < 300:  # success: parse the body into the typed response model
+                _data = typing.cast(
+                    GetScoresV3Response,
+                    parse_obj_as(
+                        type_=GetScoresV3Response,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+                return HttpResponse(response=_response, data=_data)
+            if _response.status_code == 400:  # 400/401/403/405/404 each raise a dedicated exception type
+                raise Error(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 401:
+                raise UnauthorizedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 403:
+                raise AccessDeniedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 405:
+                raise MethodNotAllowedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 404:
+                raise NotFoundError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            _response_json = _response.json()  # unmapped status: keep the parsed body for the generic ApiError
+        except JSONDecodeError:  # body was not valid JSON; report the raw text instead
+            raise ApiError(
+                status_code=_response.status_code,
+                headers=dict(_response.headers),
+                body=_response.text,
+            )
+        raise ApiError(  # reached only for a status code not handled above
+            status_code=_response.status_code,
+            headers=dict(_response.headers),
+            body=_response_json,
+        )
+
+
+class AsyncRawScoresV3Client:  # asyncio client; returns the parsed body together with the raw HTTP response
+    def __init__(self, *, client_wrapper: AsyncClientWrapper):
+        self._client_wrapper = client_wrapper
+
+    async def get_many_v3(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        cursor: typing.Optional[str] = None,
+        fields: typing.Optional[str] = None,
+        id: typing.Optional[str] = None,
+        name: typing.Optional[str] = None,
+        source: typing.Optional[str] = None,
+        data_type: typing.Optional[str] = None,
+        environment: typing.Optional[str] = None,
+        config_id: typing.Optional[str] = None,
+        queue_id: typing.Optional[str] = None,
+        author_user_id: typing.Optional[str] = None,
+        value: typing.Optional[str] = None,
+        value_min: typing.Optional[float] = None,
+        value_max: typing.Optional[float] = None,
+        trace_id: typing.Optional[str] = None,
+        session_id: typing.Optional[str] = None,
+        observation_id: typing.Optional[str] = None,
+        experiment_id: typing.Optional[str] = None,
+        from_timestamp: typing.Optional[dt.datetime] = None,
+        to_timestamp: typing.Optional[dt.datetime] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> AsyncHttpResponse[GetScoresV3Response]:
+        """
+        Get a list of scores with a polymorphic `value` field (v3).
+
+        This endpoint requires Langfuse v4 or later.
+
+        The `value` field type depends on `dataType`:
+        - `NUMERIC` → number
+        - `BOOLEAN` → boolean
+        - `CATEGORICAL`, `TEXT`, `CORRECTION` → string
+
+        Use the `fields` parameter to include optional field groups beyond the
+        default `core`. Unknown group names return HTTP 400.
+
+        Parameters
+        ----------
+        limit : typing.Optional[int]
+            Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400.
+
+        cursor : typing.Optional[str]
+            URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page.
+
+        fields : typing.Optional[str]
+            Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400.
+
+        id : typing.Optional[str]
+            Comma-separated list of score IDs to filter by (OR within, AND across filters).
+
+        name : typing.Optional[str]
+            Comma-separated list of score names to filter by.
+
+        source : typing.Optional[str]
+            Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent.
+
+        data_type : typing.Optional[str]
+            Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax.
+
+        environment : typing.Optional[str]
+            Comma-separated list of environments to filter by.
+
+        config_id : typing.Optional[str]
+            Comma-separated list of score config IDs to filter by.
+
+        queue_id : typing.Optional[str]
+            Comma-separated list of annotation queue IDs to filter by.
+
+        author_user_id : typing.Optional[str]
+            Comma-separated list of author user IDs to filter by.
+
+        value : typing.Optional[str]
+            Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400.
+
+        value_min : typing.Optional[float]
+            Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        value_max : typing.Optional[float]
+            Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400.
+
+        trace_id : typing.Optional[str]
+            Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace.
+
+        session_id : typing.Optional[str]
+            Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId.
+
+        observation_id : typing.Optional[str]
+            Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId.
+
+        experiment_id : typing.Optional[str]
+            Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId.
+
+        from_timestamp : typing.Optional[dt.datetime]
+            Inclusive lower bound on the score timestamp.
+
+        to_timestamp : typing.Optional[dt.datetime]
+            Exclusive upper bound on the score timestamp.
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        AsyncHttpResponse[GetScoresV3Response]
+        """
+        _response = await self._client_wrapper.httpx_client.request(
+            "api/public/v3/scores",
+            method="GET",
+            params={  # query keys use the API's camelCase names, not the snake_case argument names
+                "limit": limit,
+                "cursor": cursor,
+                "fields": fields,
+                "id": id,
+                "name": name,
+                "source": source,
+                "dataType": data_type,
+                "environment": environment,
+                "configId": config_id,
+                "queueId": queue_id,
+                "authorUserId": author_user_id,
+                "value": value,
+                "valueMin": value_min,
+                "valueMax": value_max,
+                "traceId": trace_id,
+                "sessionId": session_id,
+                "observationId": observation_id,
+                "experimentId": experiment_id,
+                "fromTimestamp": serialize_datetime(from_timestamp)
+                if from_timestamp is not None
+                else None,
+                "toTimestamp": serialize_datetime(to_timestamp)
+                if to_timestamp is not None
+                else None,
+            },
+            request_options=request_options,
+        )
+        try:  # every _response.json() call below shares the JSONDecodeError handler at the end
+            if 200 <= _response.status_code < 300:  # success: parse the body into the typed response model
+                _data = typing.cast(
+                    GetScoresV3Response,
+                    parse_obj_as(
+                        type_=GetScoresV3Response,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+                return AsyncHttpResponse(response=_response, data=_data)
+            if _response.status_code == 400:  # 400/401/403/405/404 each raise a dedicated exception type
+                raise Error(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 401:
+                raise UnauthorizedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 403:
+                raise AccessDeniedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 405:
+                raise MethodNotAllowedError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            if _response.status_code == 404:
+                raise NotFoundError(
+                    headers=dict(_response.headers),
+                    body=typing.cast(
+                        typing.Any,
+                        parse_obj_as(
+                            type_=typing.Any,  # type: ignore
+                            object_=_response.json(),
+                        ),
+                    ),
+                )
+            _response_json = _response.json()  # unmapped status: keep the parsed body for the generic ApiError
+        except JSONDecodeError:  # body was not valid JSON; report the raw text instead
+            raise ApiError(
+                status_code=_response.status_code,
+                headers=dict(_response.headers),
+                body=_response.text,
+            )
+        raise ApiError(  # reached only for a status code not handled above
+            status_code=_response.status_code,
+            headers=dict(_response.headers),
+            body=_response_json,
+        )
diff --git a/langfuse/api/scores_v3/types/__init__.py b/langfuse/api/scores_v3/types/__init__.py
new file mode 100644
index 000000000..14da0ca73
--- /dev/null
+++ b/langfuse/api/scores_v3/types/__init__.py
@@ -0,0 +1,114 @@
+# This file was auto-generated by Fern from our API Definition.
+
+# isort: skip_file
+
+import typing
+from importlib import import_module
+
+if typing.TYPE_CHECKING:
+    from .base_score_v3 import BaseScoreV3
+    from .boolean_score_v3 import BooleanScoreV3
+    from .categorical_score_v3 import CategoricalScoreV3
+    from .correction_score_v3 import CorrectionScoreV3
+    from .get_scores_v3meta import GetScoresV3Meta
+    from .get_scores_v3response import GetScoresV3Response
+    from .numeric_score_v3 import NumericScoreV3
+    from .score_subject_experiment_v3 import ScoreSubjectExperimentV3
+    from .score_subject_observation_v3 import ScoreSubjectObservationV3
+    from .score_subject_session_v3 import ScoreSubjectSessionV3
+    from .score_subject_trace_v3 import ScoreSubjectTraceV3
+    from .score_subject_v3 import (
+        ScoreSubjectV3,
+        ScoreSubjectV3_Experiment,
+        ScoreSubjectV3_Observation,
+        ScoreSubjectV3_Session,
+        ScoreSubjectV3_Trace,
+    )
+    from .score_v3 import (
+        ScoreV3,
+        ScoreV3_Boolean,
+        ScoreV3_Categorical,
+        ScoreV3_Correction,
+        ScoreV3_Numeric,
+        ScoreV3_Text,
+    )
+    from .text_score_v3 import TextScoreV3
+_dynamic_imports: typing.Dict[str, str] = {  # exported name -> relative submodule that defines it
+    "BaseScoreV3": ".base_score_v3",
+    "BooleanScoreV3": ".boolean_score_v3",
+    "CategoricalScoreV3": ".categorical_score_v3",
+    "CorrectionScoreV3": ".correction_score_v3",
+    "GetScoresV3Meta": ".get_scores_v3meta",
+    "GetScoresV3Response": ".get_scores_v3response",
+    "NumericScoreV3": ".numeric_score_v3",
+    "ScoreSubjectExperimentV3": ".score_subject_experiment_v3",
+    "ScoreSubjectObservationV3": ".score_subject_observation_v3",
+    "ScoreSubjectSessionV3": ".score_subject_session_v3",
+    "ScoreSubjectTraceV3": ".score_subject_trace_v3",
+    "ScoreSubjectV3": ".score_subject_v3",
+    "ScoreSubjectV3_Experiment": ".score_subject_v3",
+    "ScoreSubjectV3_Observation": ".score_subject_v3",
+    "ScoreSubjectV3_Session": ".score_subject_v3",
+    "ScoreSubjectV3_Trace": ".score_subject_v3",
+    "ScoreV3": ".score_v3",
+    "ScoreV3_Boolean": ".score_v3",
+    "ScoreV3_Categorical": ".score_v3",
+    "ScoreV3_Correction": ".score_v3",
+    "ScoreV3_Numeric": ".score_v3",
+    "ScoreV3_Text": ".score_v3",
+    "TextScoreV3": ".text_score_v3",
+}
+
+
+def __getattr__(attr_name: str) -> typing.Any:
+    """Resolve a lazily exported name on first access (module-level attribute hook)."""
+    if attr_name not in _dynamic_imports:
+        raise AttributeError(
+            f"No {attr_name} found in _dynamic_imports for module name -> {__name__}"
+        )
+    target = _dynamic_imports[attr_name]
+    try:
+        loaded = import_module(target, __package__)
+        # A name mapped to a same-named submodule resolves to the module itself.
+        return loaded if target == f".{attr_name}" else getattr(loaded, attr_name)
+    except ImportError as err:
+        # Re-raise with the requested name and target module in the message.
+        raise ImportError(
+            f"Failed to import {attr_name} from {target}: {err}"
+        ) from err
+    except AttributeError as err:
+        raise AttributeError(
+            f"Failed to get {attr_name} from {target}: {err}"
+        ) from err
+
+
+def __dir__():
+    # Advertise every lazily importable name, sorted alphabetically.
+    return sorted(_dynamic_imports)
+
+
+__all__ = [  # public names of this package; same entries as the keys of _dynamic_imports
+    "BaseScoreV3",
+    "BooleanScoreV3",
+    "CategoricalScoreV3",
+    "CorrectionScoreV3",
+    "GetScoresV3Meta",
+    "GetScoresV3Response",
+    "NumericScoreV3",
+    "ScoreSubjectExperimentV3",
+    "ScoreSubjectObservationV3",
+    "ScoreSubjectSessionV3",
+    "ScoreSubjectTraceV3",
+    "ScoreSubjectV3",
+    "ScoreSubjectV3_Experiment",
+    "ScoreSubjectV3_Observation",
+    "ScoreSubjectV3_Session",
+    "ScoreSubjectV3_Trace",
+    "ScoreV3",
+    "ScoreV3_Boolean",
+    "ScoreV3_Categorical",
+    "ScoreV3_Correction",
+    "ScoreV3_Numeric",
+    "ScoreV3_Text",
+    "TextScoreV3",
+]
diff --git a/langfuse/api/scores_v3/types/base_score_v3.py b/langfuse/api/scores_v3/types/base_score_v3.py
new file mode 100644
index 000000000..3d5394f95
--- /dev/null
+++ b/langfuse/api/scores_v3/types/base_score_v3.py
@@ -0,0 +1,71 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+import pydantic
+import typing_extensions
+from ...commons.types.score_source import ScoreSource
+from ...core.pydantic_utilities import UniversalBaseModel
+from ...core.serialization import FieldMetadata
+from .score_subject_v3 import ScoreSubjectV3
+
+
+class BaseScoreV3(UniversalBaseModel):  # fields shared by the typed v3 score models (e.g. NumericScoreV3, BooleanScoreV3)
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str = pydantic.Field()
+    """
+    The environment from which this score originated.
+    """
+
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = pydantic.Field(default=None)  # this and every field below default to None
+    """
+    Optional comment attached to the score. Present when "details" is included in the fields parameter.
+    """
+
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = pydantic.Field(default=None)
+    """
+    The score config ID, if this score was created from a config. Present when "details" is included in the fields parameter.
+    """
+
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(
+        default=None
+    )
+    """
+    Arbitrary metadata attached to the score. Present when "details" is included in the fields parameter.
+    """
+
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = pydantic.Field(default=None)
+    """
+    The user who created this score, if available. Present when "annotation" is included in the fields parameter.
+    """
+
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = pydantic.Field(default=None)
+    """
+    The annotation queue this score belongs to, if any. Present when "annotation" is included in the fields parameter.
+    """
+
+    subject: typing.Optional[ScoreSubjectV3] = pydantic.Field(default=None)
+    """
+    The entity this score is attached to (trace, observation, session, or experiment). Present when "subject" is included in the fields parameter.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept keys not declared above; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/boolean_score_v3.py b/langfuse/api/scores_v3/types/boolean_score_v3.py
new file mode 100644
index 000000000..5b94bc1d1
--- /dev/null
+++ b/langfuse/api/scores_v3/types/boolean_score_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from .base_score_v3 import BaseScoreV3
+
+
+class BooleanScoreV3(BaseScoreV3):  # BaseScoreV3 plus a required boolean `value`
+    value: bool = pydantic.Field()
+    """
+    The boolean value of the score.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/categorical_score_v3.py b/langfuse/api/scores_v3/types/categorical_score_v3.py
new file mode 100644
index 000000000..975b1f64c
--- /dev/null
+++ b/langfuse/api/scores_v3/types/categorical_score_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from .base_score_v3 import BaseScoreV3
+
+
+class CategoricalScoreV3(BaseScoreV3):  # BaseScoreV3 plus a required string `value` (the category)
+    value: str = pydantic.Field()
+    """
+    The string category value of the score.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/correction_score_v3.py b/langfuse/api/scores_v3/types/correction_score_v3.py
new file mode 100644
index 000000000..1717a6e67
--- /dev/null
+++ b/langfuse/api/scores_v3/types/correction_score_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from .base_score_v3 import BaseScoreV3
+
+
+class CorrectionScoreV3(BaseScoreV3):  # BaseScoreV3 plus a required string `value` (the correction text)
+    value: str = pydantic.Field()
+    """
+    The correction content of the score. Empty string if not set.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/get_scores_v3meta.py b/langfuse/api/scores_v3/types/get_scores_v3meta.py
new file mode 100644
index 000000000..7dfcfe0e1
--- /dev/null
+++ b/langfuse/api/scores_v3/types/get_scores_v3meta.py
@@ -0,0 +1,18 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ...core.pydantic_utilities import UniversalBaseModel
+
+
+class GetScoresV3Meta(UniversalBaseModel):  # pagination metadata returned alongside a page of v3 scores
+    limit: int  # presumably the page size applied to this response — confirm against the API docs
+    cursor: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    URL-safe base64 (base64url) cursor for the next page. Absent when there are no more results.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/get_scores_v3response.py b/langfuse/api/scores_v3/types/get_scores_v3response.py
new file mode 100644
index 000000000..4d625b29a
--- /dev/null
+++ b/langfuse/api/scores_v3/types/get_scores_v3response.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ...core.pydantic_utilities import UniversalBaseModel
+from .get_scores_v3meta import GetScoresV3Meta
+from .score_v3 import ScoreV3
+
+
+class GetScoresV3Response(UniversalBaseModel):  # response body model for the v3 scores list endpoint
+    data: typing.List[ScoreV3]  # the scores returned for this request
+    meta: GetScoresV3Meta  # pagination info (limit and optional next-page cursor)
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/numeric_score_v3.py b/langfuse/api/scores_v3/types/numeric_score_v3.py
new file mode 100644
index 000000000..10df001a4
--- /dev/null
+++ b/langfuse/api/scores_v3/types/numeric_score_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from .base_score_v3 import BaseScoreV3
+
+
+class NumericScoreV3(BaseScoreV3):  # BaseScoreV3 plus a required float `value`
+    value: float = pydantic.Field()
+    """
+    The numeric value of the score.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # accept undeclared keys; reject attribute assignment
+    )
diff --git a/langfuse/api/scores_v3/types/score_subject_experiment_v3.py b/langfuse/api/scores_v3/types/score_subject_experiment_v3.py
new file mode 100644
index 000000000..a71a49241
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_subject_experiment_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ...core.pydantic_utilities import UniversalBaseModel
+
+
+class ScoreSubjectExperimentV3(UniversalBaseModel):  # identifies an experiment (dataset run) by its ID
+    id: str = pydantic.Field()  # Field() carries no default, so the ID must be supplied
+    """
+    The dataset run ID (experiment ID).
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
diff --git a/langfuse/api/scores_v3/types/score_subject_observation_v3.py b/langfuse/api/scores_v3/types/score_subject_observation_v3.py
new file mode 100644
index 000000000..1bc2edf20
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_subject_observation_v3.py
@@ -0,0 +1,26 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+import typing_extensions
+from ...core.pydantic_utilities import UniversalBaseModel
+from ...core.serialization import FieldMetadata
+
+
+class ScoreSubjectObservationV3(UniversalBaseModel):  # identifies an observation, optionally with its parent trace
+    id: str = pydantic.Field()  # Field() carries no default, so the ID must be supplied
+    """
+    The observation ID.
+    """
+
+    trace_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="traceId")  # NOTE(review): alias is presumably the JSON key — confirm in core.serialization
+    ] = pydantic.Field(default=None)
+    """
+    The parent trace ID, if available.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
diff --git a/langfuse/api/scores_v3/types/score_subject_session_v3.py b/langfuse/api/scores_v3/types/score_subject_session_v3.py
new file mode 100644
index 000000000..cb9347583
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_subject_session_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ...core.pydantic_utilities import UniversalBaseModel
+
+
+class ScoreSubjectSessionV3(UniversalBaseModel):  # identifies a session by its ID
+    id: str = pydantic.Field()  # Field() carries no default, so the ID must be supplied
+    """
+    The session ID.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
diff --git a/langfuse/api/scores_v3/types/score_subject_trace_v3.py b/langfuse/api/scores_v3/types/score_subject_trace_v3.py
new file mode 100644
index 000000000..26aab7f07
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_subject_trace_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ...core.pydantic_utilities import UniversalBaseModel
+
+
+class ScoreSubjectTraceV3(UniversalBaseModel):  # identifies a trace by its ID
+    id: str = pydantic.Field()  # Field() carries no default, so the ID must be supplied
+    """
+    The trace ID.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
diff --git a/langfuse/api/scores_v3/types/score_subject_v3.py b/langfuse/api/scores_v3/types/score_subject_v3.py
new file mode 100644
index 000000000..7464fda55
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_subject_v3.py
@@ -0,0 +1,76 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from __future__ import annotations
+
+import typing
+
+import pydantic
+import typing_extensions
+from ...core.pydantic_utilities import UniversalBaseModel
+from ...core.serialization import FieldMetadata
+
+
+class ScoreSubjectV3_Trace(UniversalBaseModel):
+    """
+    A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment.
+    """
+
+    kind: typing.Literal["trace"] = "trace"  # tag value that selects this variant of the ScoreSubjectV3 union
+    id: str  # required; declared without a default
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreSubjectV3_Observation(UniversalBaseModel):
+    """
+    A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment.
+    """
+
+    kind: typing.Literal["observation"] = "observation"  # tag value that selects this variant of the ScoreSubjectV3 union
+    id: str  # required; declared without a default
+    trace_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="traceId")  # NOTE(review): alias is presumably the JSON key — confirm in core.serialization
+    ] = None  # the only variant of the union that carries a field besides kind and id
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreSubjectV3_Session(UniversalBaseModel):
+    """
+    A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment.
+    """
+
+    kind: typing.Literal["session"] = "session"  # tag value that selects this variant of the ScoreSubjectV3 union
+    id: str  # required; declared without a default
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreSubjectV3_Experiment(UniversalBaseModel):
+    """
+    A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment.
+    """
+
+    kind: typing.Literal["experiment"] = "experiment"  # tag value that selects this variant of the ScoreSubjectV3 union
+    id: str  # required; declared without a default
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+ScoreSubjectV3 = typing_extensions.Annotated[  # tagged union over the four score-subject variants
+    typing.Union[
+        ScoreSubjectV3_Trace,
+        ScoreSubjectV3_Observation,
+        ScoreSubjectV3_Session,
+        ScoreSubjectV3_Experiment,
+    ],
+    pydantic.Field(discriminator="kind"),  # the variant is chosen from each member's Literal "kind" field
+]
diff --git a/langfuse/api/scores_v3/types/score_v3.py b/langfuse/api/scores_v3/types/score_v3.py
new file mode 100644
index 000000000..9921d1bda
--- /dev/null
+++ b/langfuse/api/scores_v3/types/score_v3.py
@@ -0,0 +1,200 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from __future__ import annotations
+
+import datetime as dt
+import typing
+
+import pydantic
+import typing_extensions
+from ...commons.types.score_source import ScoreSource
+from ...core.pydantic_utilities import UniversalBaseModel
+from ...core.serialization import FieldMetadata
+from .score_subject_v3 import ScoreSubjectV3
+
+
+class ScoreV3_Numeric(UniversalBaseModel):  # ScoreV3 union variant tagged "NUMERIC"
+    data_type: typing_extensions.Annotated[
+        typing.Literal["NUMERIC"], FieldMetadata(alias="dataType")
+    ] = "NUMERIC"  # discriminator field named by the ScoreV3 union
+    value: float  # data_type and value are the only fields that differ between ScoreV3 variants
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = None  # from here down every field defaults to None
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = None
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = None
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = None
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = None
+    subject: typing.Optional[ScoreSubjectV3] = None  # reference to the entity the score is attached to, when present
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreV3_Boolean(UniversalBaseModel):  # ScoreV3 union variant tagged "BOOLEAN"
+    data_type: typing_extensions.Annotated[
+        typing.Literal["BOOLEAN"], FieldMetadata(alias="dataType")
+    ] = "BOOLEAN"  # discriminator field named by the ScoreV3 union
+    value: bool  # data_type and value are the only fields that differ between ScoreV3 variants
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = None  # from here down every field defaults to None
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = None
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = None
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = None
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = None
+    subject: typing.Optional[ScoreSubjectV3] = None  # reference to the entity the score is attached to, when present
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreV3_Categorical(UniversalBaseModel):  # ScoreV3 union variant tagged "CATEGORICAL"
+    data_type: typing_extensions.Annotated[
+        typing.Literal["CATEGORICAL"], FieldMetadata(alias="dataType")
+    ] = "CATEGORICAL"  # discriminator field named by the ScoreV3 union
+    value: str  # data_type and value are the only fields that differ between ScoreV3 variants
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = None  # from here down every field defaults to None
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = None
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = None
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = None
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = None
+    subject: typing.Optional[ScoreSubjectV3] = None  # reference to the entity the score is attached to, when present
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreV3_Text(UniversalBaseModel):  # ScoreV3 union variant tagged "TEXT"
+    data_type: typing_extensions.Annotated[
+        typing.Literal["TEXT"], FieldMetadata(alias="dataType")
+    ] = "TEXT"  # discriminator field named by the ScoreV3 union
+    value: str  # data_type and value are the only fields that differ between ScoreV3 variants
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = None  # from here down every field defaults to None
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = None
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = None
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = None
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = None
+    subject: typing.Optional[ScoreSubjectV3] = None  # reference to the entity the score is attached to, when present
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+class ScoreV3_Correction(UniversalBaseModel):  # ScoreV3 union variant tagged "CORRECTION"
+    data_type: typing_extensions.Annotated[
+        typing.Literal["CORRECTION"], FieldMetadata(alias="dataType")
+    ] = "CORRECTION"  # discriminator field named by the ScoreV3 union
+    value: str  # data_type and value are the only fields that differ between ScoreV3 variants
+    id: str
+    project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")]
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    environment: str
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
+    comment: typing.Optional[str] = None  # from here down every field defaults to None
+    config_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="configId")
+    ] = None
+    metadata: typing.Optional[typing.Dict[str, typing.Any]] = None
+    author_user_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="authorUserId")
+    ] = None
+    queue_id: typing_extensions.Annotated[
+        typing.Optional[str], FieldMetadata(alias="queueId")
+    ] = None
+    subject: typing.Optional[ScoreSubjectV3] = None  # reference to the entity the score is attached to, when present
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
+
+
+ScoreV3 = typing_extensions.Annotated[  # tagged union over the five score variants
+    typing.Union[
+        ScoreV3_Numeric,
+        ScoreV3_Boolean,
+        ScoreV3_Categorical,
+        ScoreV3_Text,
+        ScoreV3_Correction,
+    ],
+    pydantic.Field(discriminator="data_type"),  # names the Python field; each variant aliases it to "dataType"
+]
diff --git a/langfuse/api/scores_v3/types/text_score_v3.py b/langfuse/api/scores_v3/types/text_score_v3.py
new file mode 100644
index 000000000..3d658972c
--- /dev/null
+++ b/langfuse/api/scores_v3/types/text_score_v3.py
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from .base_score_v3 import BaseScoreV3
+
+
+class TextScoreV3(BaseScoreV3):  # extends BaseScoreV3 with a required string value
+    value: str = pydantic.Field()  # Field() carries no default, so the value must be supplied
+    """
+    The text content of the score.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True  # keep unknown input fields; disallow attribute assignment after creation
+    )
diff --git a/langfuse/api/unstable/__init__.py b/langfuse/api/unstable/__init__.py
index 75aafdc24..e5356c235 100644
--- a/langfuse/api/unstable/__init__.py
+++ b/langfuse/api/unstable/__init__.py
@@ -26,6 +26,7 @@
         ArrayOptionsEvaluationRuleFilter,
         BooleanEvaluationRuleFilter,
         CategoryOptionsEvaluationRuleFilter,
+        CodeEvaluatorSourceCodeLanguage,
         DateTimeEvaluationRuleFilter,
         EvaluationRuleArrayOptionsFilterOperator,
         EvaluationRuleBooleanFilterOperator,
@@ -73,24 +74,51 @@
         StringOptionsEvaluationRuleFilter,
     )
     from .evaluation_rules import (
+        CodeEvaluationRuleEvaluatorReference,
+        CreateCodeEvaluationRuleRequest,
         CreateEvaluationRuleRequest,
+        CreateLlmAsJudgeEvaluationRuleRequest,
         DeleteEvaluationRuleResponse,
         EvaluationRule,
         EvaluationRuleEvaluator,
         EvaluationRuleEvaluatorReference,
         EvaluationRules,
+        LlmAsJudgeEvaluationRuleEvaluatorReference,
+        LlmAsJudgeEvaluatorType,
         UpdateEvaluationRuleRequest,
     )
-    from .evaluators import CreateEvaluatorRequest, Evaluator, Evaluators
+    from .evaluators import (
+        CodeEvaluator,
+        CreateCodeEvaluatorRequest,
+        CreateEvaluatorRequest,
+        CreateEvaluatorRequest_Code,
+        CreateEvaluatorRequest_LlmAsJudge,
+        CreateLlmAsJudgeEvaluatorRequest,
+        Evaluator,
+        EvaluatorBase,
+        Evaluator_Code,
+        Evaluator_LlmAsJudge,
+        Evaluators,
+        LlmAsJudgeEvaluator,
+    )
 _dynamic_imports: typing.Dict[str, str] = {
     "AccessDeniedError": ".errors",
     "ArrayOptionsEvaluationRuleFilter": ".commons",
     "BadRequestError": ".errors",
     "BooleanEvaluationRuleFilter": ".commons",
     "CategoryOptionsEvaluationRuleFilter": ".commons",
+    "CodeEvaluationRuleEvaluatorReference": ".evaluation_rules",
+    "CodeEvaluator": ".evaluators",
+    "CodeEvaluatorSourceCodeLanguage": ".commons",
     "ConflictError": ".errors",
+    "CreateCodeEvaluationRuleRequest": ".evaluation_rules",
+    "CreateCodeEvaluatorRequest": ".evaluators",
     "CreateEvaluationRuleRequest": ".evaluation_rules",
     "CreateEvaluatorRequest": ".evaluators",
+    "CreateEvaluatorRequest_Code": ".evaluators",
+    "CreateEvaluatorRequest_LlmAsJudge": ".evaluators",
+    "CreateLlmAsJudgeEvaluationRuleRequest": ".evaluation_rules",
+    "CreateLlmAsJudgeEvaluatorRequest": ".evaluators",
     "DateTimeEvaluationRuleFilter": ".commons",
     "DeleteEvaluationRuleResponse": ".evaluation_rules",
     "EvaluationRule": ".evaluation_rules",
@@ -119,6 +147,7 @@
     "EvaluationRuleTarget": ".commons",
     "EvaluationRules": ".evaluation_rules",
     "Evaluator": ".evaluators",
+    "EvaluatorBase": ".evaluators",
     "EvaluatorModelConfig": ".commons",
     "EvaluatorOutputDataType": ".commons",
     "EvaluatorOutputDefinition": ".commons",
@@ -128,8 +157,13 @@
     "EvaluatorOutputFieldDefinition": ".commons",
     "EvaluatorScope": ".commons",
     "EvaluatorType": ".commons",
+    "Evaluator_Code": ".evaluators",
+    "Evaluator_LlmAsJudge": ".evaluators",
     "Evaluators": ".evaluators",
     "InternalServerError": ".errors",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference": ".evaluation_rules",
+    "LlmAsJudgeEvaluator": ".evaluators",
+    "LlmAsJudgeEvaluatorType": ".evaluation_rules",
     "MethodNotAllowedError": ".errors",
     "NotFoundError": ".errors",
     "NullEvaluationRuleFilter": ".commons",
@@ -194,9 +228,18 @@ def __dir__():
     "BadRequestError",
     "BooleanEvaluationRuleFilter",
     "CategoryOptionsEvaluationRuleFilter",
+    "CodeEvaluationRuleEvaluatorReference",
+    "CodeEvaluator",
+    "CodeEvaluatorSourceCodeLanguage",
     "ConflictError",
+    "CreateCodeEvaluationRuleRequest",
+    "CreateCodeEvaluatorRequest",
     "CreateEvaluationRuleRequest",
     "CreateEvaluatorRequest",
+    "CreateEvaluatorRequest_Code",
+    "CreateEvaluatorRequest_LlmAsJudge",
+    "CreateLlmAsJudgeEvaluationRuleRequest",
+    "CreateLlmAsJudgeEvaluatorRequest",
     "DateTimeEvaluationRuleFilter",
     "DeleteEvaluationRuleResponse",
     "EvaluationRule",
@@ -225,6 +268,7 @@ def __dir__():
     "EvaluationRuleTarget",
     "EvaluationRules",
     "Evaluator",
+    "EvaluatorBase",
     "EvaluatorModelConfig",
     "EvaluatorOutputDataType",
     "EvaluatorOutputDefinition",
@@ -234,8 +278,13 @@ def __dir__():
     "EvaluatorOutputFieldDefinition",
     "EvaluatorScope",
     "EvaluatorType",
+    "Evaluator_Code",
+    "Evaluator_LlmAsJudge",
     "Evaluators",
     "InternalServerError",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference",
+    "LlmAsJudgeEvaluator",
+    "LlmAsJudgeEvaluatorType",
     "MethodNotAllowedError",
     "NotFoundError",
     "NullEvaluationRuleFilter",
diff --git a/langfuse/api/unstable/commons/__init__.py b/langfuse/api/unstable/commons/__init__.py
index 13d9571ff..c617b53c7 100644
--- a/langfuse/api/unstable/commons/__init__.py
+++ b/langfuse/api/unstable/commons/__init__.py
@@ -10,6 +10,7 @@
         ArrayOptionsEvaluationRuleFilter,
         BooleanEvaluationRuleFilter,
         CategoryOptionsEvaluationRuleFilter,
+        CodeEvaluatorSourceCodeLanguage,
         DateTimeEvaluationRuleFilter,
         EvaluationRuleArrayOptionsFilterOperator,
         EvaluationRuleBooleanFilterOperator,
@@ -60,6 +61,7 @@
     "ArrayOptionsEvaluationRuleFilter": ".types",
     "BooleanEvaluationRuleFilter": ".types",
     "CategoryOptionsEvaluationRuleFilter": ".types",
+    "CodeEvaluatorSourceCodeLanguage": ".types",
     "DateTimeEvaluationRuleFilter": ".types",
     "EvaluationRuleArrayOptionsFilterOperator": ".types",
     "EvaluationRuleBooleanFilterOperator": ".types",
@@ -139,6 +141,7 @@ def __dir__():
     "ArrayOptionsEvaluationRuleFilter",
     "BooleanEvaluationRuleFilter",
     "CategoryOptionsEvaluationRuleFilter",
+    "CodeEvaluatorSourceCodeLanguage",
     "DateTimeEvaluationRuleFilter",
     "EvaluationRuleArrayOptionsFilterOperator",
     "EvaluationRuleBooleanFilterOperator",
diff --git a/langfuse/api/unstable/commons/types/__init__.py b/langfuse/api/unstable/commons/types/__init__.py
index a0e7d9f9d..487480da4 100644
--- a/langfuse/api/unstable/commons/types/__init__.py
+++ b/langfuse/api/unstable/commons/types/__init__.py
@@ -11,6 +11,7 @@
     from .category_options_evaluation_rule_filter import (
         CategoryOptionsEvaluationRuleFilter,
     )
+    from .code_evaluator_source_code_language import CodeEvaluatorSourceCodeLanguage
     from .date_time_evaluation_rule_filter import DateTimeEvaluationRuleFilter
     from .evaluation_rule_array_options_filter_operator import (
         EvaluationRuleArrayOptionsFilterOperator,
@@ -84,6 +85,7 @@
     "ArrayOptionsEvaluationRuleFilter": ".array_options_evaluation_rule_filter",
     "BooleanEvaluationRuleFilter": ".boolean_evaluation_rule_filter",
     "CategoryOptionsEvaluationRuleFilter": ".category_options_evaluation_rule_filter",
+    "CodeEvaluatorSourceCodeLanguage": ".code_evaluator_source_code_language",
     "DateTimeEvaluationRuleFilter": ".date_time_evaluation_rule_filter",
     "EvaluationRuleArrayOptionsFilterOperator": ".evaluation_rule_array_options_filter_operator",
     "EvaluationRuleBooleanFilterOperator": ".evaluation_rule_boolean_filter_operator",
@@ -163,6 +165,7 @@ def __dir__():
     "ArrayOptionsEvaluationRuleFilter",
     "BooleanEvaluationRuleFilter",
     "CategoryOptionsEvaluationRuleFilter",
+    "CodeEvaluatorSourceCodeLanguage",
     "DateTimeEvaluationRuleFilter",
     "EvaluationRuleArrayOptionsFilterOperator",
     "EvaluationRuleBooleanFilterOperator",
diff --git a/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py b/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py
new file mode 100644
index 000000000..7071a317c
--- /dev/null
+++ b/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py
@@ -0,0 +1,26 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+from ....core import enum
+
+T_Result = typing.TypeVar("T_Result")
+
+
+class CodeEvaluatorSourceCodeLanguage(enum.StrEnum):
+    """
+    Code evaluator runtime language.
+    """
+
+    PYTHON = "PYTHON"
+    TYPESCRIPT = "TYPESCRIPT"
+
+    def visit(  # call the zero-argument callback matching this member and return its result
+        self,
+        python: typing.Callable[[], T_Result],  # invoked when self is PYTHON
+        typescript: typing.Callable[[], T_Result],  # invoked when self is TYPESCRIPT
+    ) -> T_Result:
+        if self is CodeEvaluatorSourceCodeLanguage.PYTHON:  # members are compared by identity
+            return python()
+        if self is CodeEvaluatorSourceCodeLanguage.TYPESCRIPT:
+            return typescript()  # both declared members are handled; there is no fallback branch
diff --git a/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py b/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py
index 1c407819c..cd08e8b33 100644
--- a/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py
+++ b/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py
@@ -11,7 +11,9 @@
 
 class EvaluationRuleMapping(UniversalBaseModel):
     """
-    Maps one evaluator prompt variable to one source field from the target object.
+    Maps one evaluator variable to one source field from the target object.
+
+    Manual mappings are used for `llm_as_judge` evaluators. `code` evaluators use a fixed runtime mapping managed by Langfuse.
 
     How to build a valid mapping list:
     1. Create the evaluator or fetch it with `GET /evaluators/{id}`.
@@ -24,7 +26,7 @@ class EvaluationRuleMapping(UniversalBaseModel):
 
     Recovery guidance:
     - `invalid_variable_mapping`: the variable name is unknown for this evaluator, or the selected `source` is not valid for the chosen `target`
-    - `missing_variable_mapping`: one or more evaluator variables are not mapped yet
+    - `missing_variable_mapping`: one or more LLM-as-judge evaluator variables are not mapped yet
     - `duplicate_variable_mapping`: the same evaluator variable appears more than once
     - `invalid_json_path`: the JSONPath expression is malformed. Remove it or correct it.
 
diff --git a/langfuse/api/unstable/commons/types/evaluator_type.py b/langfuse/api/unstable/commons/types/evaluator_type.py
index d411d6111..f219fb7e1 100644
--- a/langfuse/api/unstable/commons/types/evaluator_type.py
+++ b/langfuse/api/unstable/commons/types/evaluator_type.py
@@ -11,11 +11,18 @@ class EvaluatorType(enum.StrEnum):
     """
     The evaluator engine type.
 
-    The unstable public API currently supports only LLM-as-a-judge evaluators.
+    The unstable public API supports LLM-as-a-judge and code evaluators.
     """
 
     LLM_AS_JUDGE = "llm_as_judge"
+    CODE = "code"
 
-    def visit(self, llm_as_judge: typing.Callable[[], T_Result]) -> T_Result:
+    def visit(  # call the zero-argument callback matching this member and return its result
+        self,
+        llm_as_judge: typing.Callable[[], T_Result],  # invoked when self is LLM_AS_JUDGE
+        code: typing.Callable[[], T_Result],  # invoked when self is CODE
+    ) -> T_Result:
         if self is EvaluatorType.LLM_AS_JUDGE:
             return llm_as_judge()
+        if self is EvaluatorType.CODE:  # members are compared by identity
+            return code()
diff --git a/langfuse/api/unstable/evaluation_rules/__init__.py b/langfuse/api/unstable/evaluation_rules/__init__.py
index f0c007231..8541bdcc8 100644
--- a/langfuse/api/unstable/evaluation_rules/__init__.py
+++ b/langfuse/api/unstable/evaluation_rules/__init__.py
@@ -7,21 +7,31 @@
 
 if typing.TYPE_CHECKING:
     from .types import (
+        CodeEvaluationRuleEvaluatorReference,
+        CreateCodeEvaluationRuleRequest,
         CreateEvaluationRuleRequest,
+        CreateLlmAsJudgeEvaluationRuleRequest,
         DeleteEvaluationRuleResponse,
         EvaluationRule,
         EvaluationRuleEvaluator,
         EvaluationRuleEvaluatorReference,
         EvaluationRules,
+        LlmAsJudgeEvaluationRuleEvaluatorReference,
+        LlmAsJudgeEvaluatorType,
         UpdateEvaluationRuleRequest,
     )
 _dynamic_imports: typing.Dict[str, str] = {
+    "CodeEvaluationRuleEvaluatorReference": ".types",
+    "CreateCodeEvaluationRuleRequest": ".types",
     "CreateEvaluationRuleRequest": ".types",
+    "CreateLlmAsJudgeEvaluationRuleRequest": ".types",
     "DeleteEvaluationRuleResponse": ".types",
     "EvaluationRule": ".types",
     "EvaluationRuleEvaluator": ".types",
     "EvaluationRuleEvaluatorReference": ".types",
     "EvaluationRules": ".types",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference": ".types",
+    "LlmAsJudgeEvaluatorType": ".types",
     "UpdateEvaluationRuleRequest": ".types",
 }
 
@@ -54,11 +64,16 @@ def __dir__():
 
 
 __all__ = [
+    "CodeEvaluationRuleEvaluatorReference",
+    "CreateCodeEvaluationRuleRequest",
     "CreateEvaluationRuleRequest",
+    "CreateLlmAsJudgeEvaluationRuleRequest",
     "DeleteEvaluationRuleResponse",
     "EvaluationRule",
     "EvaluationRuleEvaluator",
     "EvaluationRuleEvaluatorReference",
     "EvaluationRules",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference",
+    "LlmAsJudgeEvaluatorType",
     "UpdateEvaluationRuleRequest",
 ]
diff --git a/langfuse/api/unstable/evaluation_rules/client.py b/langfuse/api/unstable/evaluation_rules/client.py
index 20e56e6c3..aa0cefbdf 100644
--- a/langfuse/api/unstable/evaluation_rules/client.py
+++ b/langfuse/api/unstable/evaluation_rules/client.py
@@ -8,6 +8,7 @@
 from ..commons.types.evaluation_rule_mapping import EvaluationRuleMapping
 from ..commons.types.evaluation_rule_target import EvaluationRuleTarget
 from .raw_client import AsyncRawEvaluationRulesClient, RawEvaluationRulesClient
+from .types.create_evaluation_rule_request import CreateEvaluationRuleRequest
 from .types.delete_evaluation_rule_response import DeleteEvaluationRuleResponse
 from .types.evaluation_rule import EvaluationRule
 from .types.evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference
@@ -35,13 +36,7 @@ def with_raw_response(self) -> RawEvaluationRulesClient:
     def create(
         self,
         *,
-        name: str,
-        evaluator: EvaluationRuleEvaluatorReference,
-        target: EvaluationRuleTarget,
-        enabled: bool,
-        mapping: typing.Sequence[EvaluationRuleMapping],
-        sampling: typing.Optional[float] = OMIT,
-        filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT,
+        request: CreateEvaluationRuleRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> EvaluationRule:
         """
@@ -57,8 +52,9 @@ def create(
         - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints
         - Langfuse resolves that family to its latest version before saving the evaluation rule
         - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId`
-        - every evaluator prompt variable must be mapped exactly once
-        - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment`
+        - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once
+        - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests
+        - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment`
         - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run
         - at most 50 evaluation rules can be effectively active in one project at the same time
 
@@ -75,44 +71,15 @@ def create(
         Recovery guidance:
         - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues`
         - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response
-        - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
+        - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
         - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable
-        - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name
+        - `400 invalid_variable_mapping`: for `llm_as_judge`, switch to a valid `source` for the selected `target`, or fix the variable name
         - `400 invalid_json_path`: remove or correct the `jsonPath`
         - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request.
 
         Parameters
         ----------
-        name : str
-            Human-readable deployment name.
-
-        evaluator : EvaluationRuleEvaluatorReference
-            Evaluator family to use.
-
-            Use `name` and `scope` from the evaluator endpoints.
-            Langfuse resolves that family to its latest version before saving the rule.
-
-        target : EvaluationRuleTarget
-            Target object type to evaluate.
-
-        enabled : bool
-            Whether the deployment should be active immediately after creation.
-
-        mapping : typing.Sequence[EvaluationRuleMapping]
-            Required variable mappings.
-
-            Every evaluator variable must appear exactly once.
-            Build this list from the evaluator `variables` array returned by the evaluator endpoints.
-
-        sampling : typing.Optional[float]
-            Optional sampling fraction. Defaults to `1`.
-
-        filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]]
-            Optional filter list.
-
-            Omit or pass an empty list to evaluate all matching targets for the selected `target`.
-            Each filter object must use a column that is valid for that `target`.
-            For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+        request : CreateEvaluationRuleRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -132,7 +99,11 @@ def create(
             EvaluationRuleTarget,
             EvaluatorScope,
         )
-        from langfuse.unstable.evaluation_rules import EvaluationRuleEvaluatorReference
+        from langfuse.unstable.evaluation_rules import (
+            CreateLlmAsJudgeEvaluationRuleRequest,
+            LlmAsJudgeEvaluationRuleEvaluatorReference,
+            LlmAsJudgeEvaluatorType,
+        )
 
         client = LangfuseAPI(
             x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
@@ -143,42 +114,38 @@ def create(
             base_url="https://yourhost.com/path/to/api",
         )
         client.unstable.evaluation_rules.create(
-            name="answer-correctness-live",
-            evaluator=EvaluationRuleEvaluatorReference(
-                name="answer-correctness",
-                scope=EvaluatorScope.PROJECT,
-            ),
-            target=EvaluationRuleTarget.OBSERVATION,
-            enabled=True,
-            sampling=1.0,
-            filter=[
-                EvaluationRuleFilter_StringOptions(
-                    column="type",
-                    operator=EvaluationRuleOptionsFilterOperator.ANY_OF,
-                    value=["GENERATION"],
-                )
-            ],
-            mapping=[
-                EvaluationRuleMapping(
-                    variable="input",
-                    source=EvaluationRuleMappingSource.INPUT,
-                ),
-                EvaluationRuleMapping(
-                    variable="output",
-                    source=EvaluationRuleMappingSource.OUTPUT,
+            request=CreateLlmAsJudgeEvaluationRuleRequest(
+                name="answer-correctness-live",
+                evaluator=LlmAsJudgeEvaluationRuleEvaluatorReference(
+                    name="answer-correctness",
+                    scope=EvaluatorScope.PROJECT,
+                    type=LlmAsJudgeEvaluatorType.LLM_AS_JUDGE,
                 ),
-            ],
+                target=EvaluationRuleTarget.OBSERVATION,
+                enabled=True,
+                sampling=1.0,
+                filter=[
+                    EvaluationRuleFilter_StringOptions(
+                        column="type",
+                        operator=EvaluationRuleOptionsFilterOperator.ANY_OF,
+                        value=["GENERATION"],
+                    )
+                ],
+                mapping=[
+                    EvaluationRuleMapping(
+                        variable="input",
+                        source=EvaluationRuleMappingSource.INPUT,
+                    ),
+                    EvaluationRuleMapping(
+                        variable="output",
+                        source=EvaluationRuleMappingSource.OUTPUT,
+                    ),
+                ],
+            ),
         )
         """
         _response = self._raw_client.create(
-            name=name,
-            evaluator=evaluator,
-            target=target,
-            enabled=enabled,
-            mapping=mapping,
-            sampling=sampling,
-            filter=filter,
-            request_options=request_options,
+            request=request, request_options=request_options
         )
         return _response.data
 
@@ -293,18 +260,19 @@ def update(
         - switch to another evaluator
         - adjust sampling
         - change filters
-        - update variable mappings
+        - update LLM-as-judge variable mappings
 
         Important behavior:
         - provide only the fields you want to change
         - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving
-        - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration
-        - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration
+        - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically
         - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run
         - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409`
 
         Recovery guidance:
-        - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
+        - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
         - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter`
 
         Parameters
@@ -319,6 +287,7 @@ def update(
             Updated evaluator family.
 
             Langfuse resolves the provided evaluator family to its latest version before saving the rule.
+            A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule.
 
         target : typing.Optional[EvaluationRuleTarget]
             Updated target object type.
@@ -335,7 +304,9 @@ def update(
             For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
 
         mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]]
-            Updated variable mappings.
+            Updated LLM-as-judge variable mappings.
+
+            Do not send this field for `code` evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -436,13 +407,7 @@ def with_raw_response(self) -> AsyncRawEvaluationRulesClient:
     async def create(
         self,
         *,
-        name: str,
-        evaluator: EvaluationRuleEvaluatorReference,
-        target: EvaluationRuleTarget,
-        enabled: bool,
-        mapping: typing.Sequence[EvaluationRuleMapping],
-        sampling: typing.Optional[float] = OMIT,
-        filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT,
+        request: CreateEvaluationRuleRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> EvaluationRule:
         """
@@ -458,8 +423,9 @@ async def create(
         - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints
         - Langfuse resolves that family to its latest version before saving the evaluation rule
         - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId`
-        - every evaluator prompt variable must be mapped exactly once
-        - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment`
+        - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once
+        - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests
+        - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment`
         - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run
         - at most 50 evaluation rules can be effectively active in one project at the same time
 
@@ -476,44 +442,15 @@ async def create(
         Recovery guidance:
         - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues`
         - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response
-        - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
+        - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
         - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable
-        - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name
+        - `400 invalid_variable_mapping`: for `llm_as_judge` evaluators, switch to a valid `source` for the selected `target`, or fix the variable name
         - `400 invalid_json_path`: remove or correct the `jsonPath`
         - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request.
 
         Parameters
         ----------
-        name : str
-            Human-readable deployment name.
-
-        evaluator : EvaluationRuleEvaluatorReference
-            Evaluator family to use.
-
-            Use `name` and `scope` from the evaluator endpoints.
-            Langfuse resolves that family to its latest version before saving the rule.
-
-        target : EvaluationRuleTarget
-            Target object type to evaluate.
-
-        enabled : bool
-            Whether the deployment should be active immediately after creation.
-
-        mapping : typing.Sequence[EvaluationRuleMapping]
-            Required variable mappings.
-
-            Every evaluator variable must appear exactly once.
-            Build this list from the evaluator `variables` array returned by the evaluator endpoints.
-
-        sampling : typing.Optional[float]
-            Optional sampling fraction. Defaults to `1`.
-
-        filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]]
-            Optional filter list.
-
-            Omit or pass an empty list to evaluate all matching targets for the selected `target`.
-            Each filter object must use a column that is valid for that `target`.
-            For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+        request : CreateEvaluationRuleRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -535,7 +472,11 @@ async def create(
             EvaluationRuleTarget,
             EvaluatorScope,
         )
-        from langfuse.unstable.evaluation_rules import EvaluationRuleEvaluatorReference
+        from langfuse.unstable.evaluation_rules import (
+            CreateLlmAsJudgeEvaluationRuleRequest,
+            LlmAsJudgeEvaluationRuleEvaluatorReference,
+            LlmAsJudgeEvaluatorType,
+        )
 
         client = AsyncLangfuseAPI(
             x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
@@ -549,45 +490,41 @@ async def create(
 
         async def main() -> None:
             await client.unstable.evaluation_rules.create(
-                name="answer-correctness-live",
-                evaluator=EvaluationRuleEvaluatorReference(
-                    name="answer-correctness",
-                    scope=EvaluatorScope.PROJECT,
-                ),
-                target=EvaluationRuleTarget.OBSERVATION,
-                enabled=True,
-                sampling=1.0,
-                filter=[
-                    EvaluationRuleFilter_StringOptions(
-                        column="type",
-                        operator=EvaluationRuleOptionsFilterOperator.ANY_OF,
-                        value=["GENERATION"],
-                    )
-                ],
-                mapping=[
-                    EvaluationRuleMapping(
-                        variable="input",
-                        source=EvaluationRuleMappingSource.INPUT,
-                    ),
-                    EvaluationRuleMapping(
-                        variable="output",
-                        source=EvaluationRuleMappingSource.OUTPUT,
+                request=CreateLlmAsJudgeEvaluationRuleRequest(
+                    name="answer-correctness-live",
+                    evaluator=LlmAsJudgeEvaluationRuleEvaluatorReference(
+                        name="answer-correctness",
+                        scope=EvaluatorScope.PROJECT,
+                        type=LlmAsJudgeEvaluatorType.LLM_AS_JUDGE,
                     ),
-                ],
+                    target=EvaluationRuleTarget.OBSERVATION,
+                    enabled=True,
+                    sampling=1.0,
+                    filter=[
+                        EvaluationRuleFilter_StringOptions(
+                            column="type",
+                            operator=EvaluationRuleOptionsFilterOperator.ANY_OF,
+                            value=["GENERATION"],
+                        )
+                    ],
+                    mapping=[
+                        EvaluationRuleMapping(
+                            variable="input",
+                            source=EvaluationRuleMappingSource.INPUT,
+                        ),
+                        EvaluationRuleMapping(
+                            variable="output",
+                            source=EvaluationRuleMappingSource.OUTPUT,
+                        ),
+                    ],
+                ),
             )
 
 
         asyncio.run(main())
         """
         _response = await self._raw_client.create(
-            name=name,
-            evaluator=evaluator,
-            target=target,
-            enabled=enabled,
-            mapping=mapping,
-            sampling=sampling,
-            filter=filter,
-            request_options=request_options,
+            request=request, request_options=request_options
         )
         return _response.data
 
@@ -718,18 +655,19 @@ async def update(
         - switch to another evaluator
         - adjust sampling
         - change filters
-        - update variable mappings
+        - update LLM-as-judge variable mappings
 
         Important behavior:
         - provide only the fields you want to change
         - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving
-        - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration
-        - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration
+        - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically
         - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run
         - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409`
 
         Recovery guidance:
-        - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
+        - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
         - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter`
 
         Parameters
@@ -744,6 +682,7 @@ async def update(
             Updated evaluator family.
 
             Langfuse resolves the provided evaluator family to its latest version before saving the rule.
+            A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule.
 
         target : typing.Optional[EvaluationRuleTarget]
             Updated target object type.
@@ -760,7 +699,9 @@ async def update(
             For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
 
         mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]]
-            Updated variable mappings.
+            Updated LLM-as-judge variable mappings.
+
+            Do not send this field for `code` evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
diff --git a/langfuse/api/unstable/evaluation_rules/raw_client.py b/langfuse/api/unstable/evaluation_rules/raw_client.py
index f99aba663..7115cbe70 100644
--- a/langfuse/api/unstable/evaluation_rules/raw_client.py
+++ b/langfuse/api/unstable/evaluation_rules/raw_client.py
@@ -44,6 +44,7 @@
 )
 from ..errors.errors.unprocessable_content_error import UnprocessableContentError
 from ..errors.types.public_api_error import PublicApiError
+from .types.create_evaluation_rule_request import CreateEvaluationRuleRequest
 from .types.delete_evaluation_rule_response import DeleteEvaluationRuleResponse
 from .types.evaluation_rule import EvaluationRule
 from .types.evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference
@@ -60,13 +61,7 @@ def __init__(self, *, client_wrapper: SyncClientWrapper):
     def create(
         self,
         *,
-        name: str,
-        evaluator: EvaluationRuleEvaluatorReference,
-        target: EvaluationRuleTarget,
-        enabled: bool,
-        mapping: typing.Sequence[EvaluationRuleMapping],
-        sampling: typing.Optional[float] = OMIT,
-        filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT,
+        request: CreateEvaluationRuleRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[EvaluationRule]:
         """
@@ -82,8 +77,9 @@ def create(
         - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints
         - Langfuse resolves that family to its latest version before saving the evaluation rule
         - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId`
-        - every evaluator prompt variable must be mapped exactly once
-        - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment`
+        - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once
+        - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests
+        - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment`
         - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run
         - at most 50 evaluation rules can be effectively active in one project at the same time
 
@@ -100,44 +96,15 @@ def create(
         Recovery guidance:
         - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues`
         - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response
-        - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
+        - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
         - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable
-        - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name
+        - `400 invalid_variable_mapping`: for `llm_as_judge` evaluators, switch to a valid `source` for the selected `target`, or fix the variable name
         - `400 invalid_json_path`: remove or correct the `jsonPath`
         - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request.
 
         Parameters
         ----------
-        name : str
-            Human-readable deployment name.
-
-        evaluator : EvaluationRuleEvaluatorReference
-            Evaluator family to use.
-
-            Use `name` and `scope` from the evaluator endpoints.
-            Langfuse resolves that family to its latest version before saving the rule.
-
-        target : EvaluationRuleTarget
-            Target object type to evaluate.
-
-        enabled : bool
-            Whether the deployment should be active immediately after creation.
-
-        mapping : typing.Sequence[EvaluationRuleMapping]
-            Required variable mappings.
-
-            Every evaluator variable must appear exactly once.
-            Build this list from the evaluator `variables` array returned by the evaluator endpoints.
-
-        sampling : typing.Optional[float]
-            Optional sampling fraction. Defaults to `1`.
-
-        filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]]
-            Optional filter list.
-
-            Omit or pass an empty list to evaluate all matching targets for the selected `target`.
-            Each filter object must use a column that is valid for that `target`.
-            For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+        request : CreateEvaluationRuleRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -149,27 +116,11 @@ def create(
         _response = self._client_wrapper.httpx_client.request(
             "api/public/unstable/evaluation-rules",
             method="POST",
-            json={
-                "name": name,
-                "evaluator": convert_and_respect_annotation_metadata(
-                    object_=evaluator,
-                    annotation=EvaluationRuleEvaluatorReference,
-                    direction="write",
-                ),
-                "target": target,
-                "enabled": enabled,
-                "sampling": sampling,
-                "filter": convert_and_respect_annotation_metadata(
-                    object_=filter,
-                    annotation=typing.Sequence[EvaluationRuleFilter],
-                    direction="write",
-                ),
-                "mapping": convert_and_respect_annotation_metadata(
-                    object_=mapping,
-                    annotation=typing.Sequence[EvaluationRuleMapping],
-                    direction="write",
-                ),
-            },
+            json=convert_and_respect_annotation_metadata(
+                object_=request,
+                annotation=CreateEvaluationRuleRequest,
+                direction="write",
+            ),
             request_options=request_options,
             omit=OMIT,
         )
@@ -734,18 +685,19 @@ def update(
         - switch to another evaluator
         - adjust sampling
         - change filters
-        - update variable mappings
+        - update LLM-as-judge variable mappings
 
         Important behavior:
         - provide only the fields you want to change
         - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving
-        - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration
-        - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration
+        - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically
         - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run
         - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409`
 
         Recovery guidance:
-        - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
+        - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
         - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter`
 
         Parameters
@@ -760,6 +712,7 @@ def update(
             Updated evaluator family.
 
             Langfuse resolves the provided evaluator family to its latest version before saving the rule.
+            A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule.
 
         target : typing.Optional[EvaluationRuleTarget]
             Updated target object type.
@@ -776,7 +729,9 @@ def update(
             For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
 
         mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]]
-            Updated variable mappings.
+            Updated LLM-as-judge variable mappings.
+
+            Do not send this field for `code` evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1169,13 +1124,7 @@ def __init__(self, *, client_wrapper: AsyncClientWrapper):
     async def create(
         self,
         *,
-        name: str,
-        evaluator: EvaluationRuleEvaluatorReference,
-        target: EvaluationRuleTarget,
-        enabled: bool,
-        mapping: typing.Sequence[EvaluationRuleMapping],
-        sampling: typing.Optional[float] = OMIT,
-        filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT,
+        request: CreateEvaluationRuleRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[EvaluationRule]:
         """
@@ -1191,8 +1140,9 @@ async def create(
         - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints
         - Langfuse resolves that family to its latest version before saving the evaluation rule
         - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId`
-        - every evaluator prompt variable must be mapped exactly once
-        - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment`
+        - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once
+        - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests
+        - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment`
         - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run
         - at most 50 evaluation rules can be effectively active in one project at the same time
 
@@ -1209,44 +1159,15 @@ async def create(
         Recovery guidance:
         - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues`
         - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response
-        - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
+        - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping`
         - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable
-        - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name
+        - `400 invalid_variable_mapping`: for `llm_as_judge` evaluators, switch to a valid `source` for the selected `target`, or fix the variable name
         - `400 invalid_json_path`: remove or correct the `jsonPath`
         - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request.
 
         Parameters
         ----------
-        name : str
-            Human-readable deployment name.
-
-        evaluator : EvaluationRuleEvaluatorReference
-            Evaluator family to use.
-
-            Use `name` and `scope` from the evaluator endpoints.
-            Langfuse resolves that family to its latest version before saving the rule.
-
-        target : EvaluationRuleTarget
-            Target object type to evaluate.
-
-        enabled : bool
-            Whether the deployment should be active immediately after creation.
-
-        mapping : typing.Sequence[EvaluationRuleMapping]
-            Required variable mappings.
-
-            Every evaluator variable must appear exactly once.
-            Build this list from the evaluator `variables` array returned by the evaluator endpoints.
-
-        sampling : typing.Optional[float]
-            Optional sampling fraction. Defaults to `1`.
-
-        filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]]
-            Optional filter list.
-
-            Omit or pass an empty list to evaluate all matching targets for the selected `target`.
-            Each filter object must use a column that is valid for that `target`.
-            For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+        request : CreateEvaluationRuleRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1258,27 +1179,11 @@ async def create(
         _response = await self._client_wrapper.httpx_client.request(
             "api/public/unstable/evaluation-rules",
             method="POST",
-            json={
-                "name": name,
-                "evaluator": convert_and_respect_annotation_metadata(
-                    object_=evaluator,
-                    annotation=EvaluationRuleEvaluatorReference,
-                    direction="write",
-                ),
-                "target": target,
-                "enabled": enabled,
-                "sampling": sampling,
-                "filter": convert_and_respect_annotation_metadata(
-                    object_=filter,
-                    annotation=typing.Sequence[EvaluationRuleFilter],
-                    direction="write",
-                ),
-                "mapping": convert_and_respect_annotation_metadata(
-                    object_=mapping,
-                    annotation=typing.Sequence[EvaluationRuleMapping],
-                    direction="write",
-                ),
-            },
+            json=convert_and_respect_annotation_metadata(
+                object_=request,
+                annotation=CreateEvaluationRuleRequest,
+                direction="write",
+            ),
             request_options=request_options,
             omit=OMIT,
         )
@@ -1843,18 +1748,19 @@ async def update(
         - switch to another evaluator
         - adjust sampling
         - change filters
-        - update variable mappings
+        - update LLM-as-judge variable mappings
 
         Important behavior:
         - provide only the fields you want to change
         - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving
-        - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration
-        - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration
+        - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target
+        - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically
         - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run
         - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409`
 
         Recovery guidance:
-        - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
+        - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping`
         - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter`
 
         Parameters
@@ -1869,6 +1775,7 @@ async def update(
             Updated evaluator family.
 
             Langfuse resolves the provided evaluator family to its latest version before saving the rule.
+            A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule.
 
         target : typing.Optional[EvaluationRuleTarget]
             Updated target object type.
@@ -1885,7 +1792,9 @@ async def update(
             For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
 
         mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]]
-            Updated variable mappings.
+            Updated LLM-as-judge variable mappings.
+
+            Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
diff --git a/langfuse/api/unstable/evaluation_rules/types/__init__.py b/langfuse/api/unstable/evaluation_rules/types/__init__.py
index 2854b1237..a1cdeb967 100644
--- a/langfuse/api/unstable/evaluation_rules/types/__init__.py
+++ b/langfuse/api/unstable/evaluation_rules/types/__init__.py
@@ -6,20 +6,36 @@
 from importlib import import_module
 
 if typing.TYPE_CHECKING:
+    from .code_evaluation_rule_evaluator_reference import (
+        CodeEvaluationRuleEvaluatorReference,
+    )
+    from .create_code_evaluation_rule_request import CreateCodeEvaluationRuleRequest
     from .create_evaluation_rule_request import CreateEvaluationRuleRequest
+    from .create_llm_as_judge_evaluation_rule_request import (
+        CreateLlmAsJudgeEvaluationRuleRequest,
+    )
     from .delete_evaluation_rule_response import DeleteEvaluationRuleResponse
     from .evaluation_rule import EvaluationRule
     from .evaluation_rule_evaluator import EvaluationRuleEvaluator
     from .evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference
     from .evaluation_rules import EvaluationRules
+    from .llm_as_judge_evaluation_rule_evaluator_reference import (
+        LlmAsJudgeEvaluationRuleEvaluatorReference,
+    )
+    from .llm_as_judge_evaluator_type import LlmAsJudgeEvaluatorType
     from .update_evaluation_rule_request import UpdateEvaluationRuleRequest
 _dynamic_imports: typing.Dict[str, str] = {
+    "CodeEvaluationRuleEvaluatorReference": ".code_evaluation_rule_evaluator_reference",
+    "CreateCodeEvaluationRuleRequest": ".create_code_evaluation_rule_request",
     "CreateEvaluationRuleRequest": ".create_evaluation_rule_request",
+    "CreateLlmAsJudgeEvaluationRuleRequest": ".create_llm_as_judge_evaluation_rule_request",
     "DeleteEvaluationRuleResponse": ".delete_evaluation_rule_response",
     "EvaluationRule": ".evaluation_rule",
     "EvaluationRuleEvaluator": ".evaluation_rule_evaluator",
     "EvaluationRuleEvaluatorReference": ".evaluation_rule_evaluator_reference",
     "EvaluationRules": ".evaluation_rules",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference": ".llm_as_judge_evaluation_rule_evaluator_reference",
+    "LlmAsJudgeEvaluatorType": ".llm_as_judge_evaluator_type",
     "UpdateEvaluationRuleRequest": ".update_evaluation_rule_request",
 }
 
@@ -52,11 +68,16 @@ def __dir__():
 
 
 __all__ = [
+    "CodeEvaluationRuleEvaluatorReference",
+    "CreateCodeEvaluationRuleRequest",
     "CreateEvaluationRuleRequest",
+    "CreateLlmAsJudgeEvaluationRuleRequest",
     "DeleteEvaluationRuleResponse",
     "EvaluationRule",
     "EvaluationRuleEvaluator",
     "EvaluationRuleEvaluatorReference",
     "EvaluationRules",
+    "LlmAsJudgeEvaluationRuleEvaluatorReference",
+    "LlmAsJudgeEvaluatorType",
     "UpdateEvaluationRuleRequest",
 ]
diff --git a/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py
new file mode 100644
index 000000000..1c259bab8
--- /dev/null
+++ b/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ....core.pydantic_utilities import UniversalBaseModel
+from ...commons.types.evaluator_scope import EvaluatorScope
+
+
+class CodeEvaluationRuleEvaluatorReference(UniversalBaseModel):
+    """
+    Code evaluator family reference used when creating an evaluation rule.
+    """
+
+    name: str = pydantic.Field()
+    """
+    Evaluator family name.
+    """
+
+    scope: EvaluatorScope = pydantic.Field()
+    """
+    Whether the evaluator family is project-owned or Langfuse-managed.
+    """
+
+    type: typing.Literal["code"] = pydantic.Field(default="code")
+    """
+    Must be `code`.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py
new file mode 100644
index 000000000..08df1f78a
--- /dev/null
+++ b/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py
@@ -0,0 +1,56 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ....core.pydantic_utilities import UniversalBaseModel
+from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter
+from ...commons.types.evaluation_rule_target import EvaluationRuleTarget
+from .code_evaluation_rule_evaluator_reference import (
+    CodeEvaluationRuleEvaluatorReference,
+)
+
+
+class CreateCodeEvaluationRuleRequest(UniversalBaseModel):
+    name: str = pydantic.Field()
+    """
+    Human-readable deployment name.
+    """
+
+    evaluator: CodeEvaluationRuleEvaluatorReference = pydantic.Field()
+    """
+    Code evaluator family to use.
+    
+    Use `name`, `scope`, and `type` from the evaluator endpoints.
+    Langfuse resolves that family to its latest version before saving the rule.
+    """
+
+    target: EvaluationRuleTarget = pydantic.Field()
+    """
+    Target object type to evaluate.
+    """
+
+    enabled: bool = pydantic.Field()
+    """
+    Whether the deployment should be active immediately after creation.
+    """
+
+    sampling: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Optional sampling fraction. Defaults to `1`.
+    """
+
+    filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field(
+        default=None
+    )
+    """
+    Optional filter list.
+    
+    Omit or pass an empty list to evaluate all matching targets for the selected `target`.
+    Each filter object must use a column that is valid for that `target`.
+    For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py
index 9a90b227a..a6504934d 100644
--- a/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py
+++ b/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py
@@ -2,74 +2,11 @@
 
 import typing
 
-import pydantic
-from ....core.pydantic_utilities import UniversalBaseModel
-from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter
-from ...commons.types.evaluation_rule_mapping import EvaluationRuleMapping
-from ...commons.types.evaluation_rule_target import EvaluationRuleTarget
-from .evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference
-
-
-class CreateEvaluationRuleRequest(UniversalBaseModel):
-    """
-    Request body for creating an evaluation rule.
-
-    Checklist for agents and SDK clients:
-    - reference an existing evaluator family by `evaluator.name` and `evaluator.scope`
-    - choose `target=observation` or `target=experiment`
-    - if `target=experiment` and you want a dataset filter, call `GET /api/public/v2/datasets` first and use dataset `id` values in `filter[].value`
-    - fetch or inspect the evaluator first, then provide a complete variable mapping for every evaluator variable listed in `variables`
-    - optionally narrow execution with `filter`
-    - set `enabled=true` only when you want live execution immediately
-    """
-
-    name: str = pydantic.Field()
-    """
-    Human-readable deployment name.
-    """
-
-    evaluator: EvaluationRuleEvaluatorReference = pydantic.Field()
-    """
-    Evaluator family to use.
-    
-    Use `name` and `scope` from the evaluator endpoints.
-    Langfuse resolves that family to its latest version before saving the rule.
-    """
-
-    target: EvaluationRuleTarget = pydantic.Field()
-    """
-    Target object type to evaluate.
-    """
-
-    enabled: bool = pydantic.Field()
-    """
-    Whether the deployment should be active immediately after creation.
-    """
-
-    sampling: typing.Optional[float] = pydantic.Field(default=None)
-    """
-    Optional sampling fraction. Defaults to `1`.
-    """
-
-    filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field(
-        default=None
-    )
-    """
-    Optional filter list.
-    
-    Omit or pass an empty list to evaluate all matching targets for the selected `target`.
-    Each filter object must use a column that is valid for that `target`.
-    For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
-    """
-
-    mapping: typing.List[EvaluationRuleMapping] = pydantic.Field()
-    """
-    Required variable mappings.
-    
-    Every evaluator variable must appear exactly once.
-    Build this list from the evaluator `variables` array returned by the evaluator endpoints.
-    """
-
-    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
-        extra="allow", frozen=True
-    )
+from .create_code_evaluation_rule_request import CreateCodeEvaluationRuleRequest
+from .create_llm_as_judge_evaluation_rule_request import (
+    CreateLlmAsJudgeEvaluationRuleRequest,
+)
+
+CreateEvaluationRuleRequest = typing.Union[
+    CreateLlmAsJudgeEvaluationRuleRequest, CreateCodeEvaluationRuleRequest
+]
diff --git a/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py
new file mode 100644
index 000000000..b511b4353
--- /dev/null
+++ b/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py
@@ -0,0 +1,65 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ....core.pydantic_utilities import UniversalBaseModel
+from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter
+from ...commons.types.evaluation_rule_mapping import EvaluationRuleMapping
+from ...commons.types.evaluation_rule_target import EvaluationRuleTarget
+from .llm_as_judge_evaluation_rule_evaluator_reference import (
+    LlmAsJudgeEvaluationRuleEvaluatorReference,
+)
+
+
+class CreateLlmAsJudgeEvaluationRuleRequest(UniversalBaseModel):
+    name: str = pydantic.Field()
+    """
+    Human-readable deployment name.
+    """
+
+    evaluator: LlmAsJudgeEvaluationRuleEvaluatorReference = pydantic.Field()
+    """
+    LLM-as-judge evaluator family to use.
+    
+    Use `name`, `scope`, and `type` from the evaluator endpoints. If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility.
+    Langfuse resolves that family to its latest version before saving the rule.
+    """
+
+    target: EvaluationRuleTarget = pydantic.Field()
+    """
+    Target object type to evaluate.
+    """
+
+    enabled: bool = pydantic.Field()
+    """
+    Whether the deployment should be active immediately after creation.
+    """
+
+    sampling: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Optional sampling fraction. Defaults to `1`.
+    """
+
+    filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field(
+        default=None
+    )
+    """
+    Optional filter list.
+    
+    Omit or pass an empty list to evaluate all matching targets for the selected `target`.
+    Each filter object must use a column that is valid for that `target`.
+    For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names.
+    """
+
+    mapping: typing.List[EvaluationRuleMapping] = pydantic.Field()
+    """
+    LLM-as-judge variable mappings.
+    
+    Every evaluator variable must appear exactly once.
+    Build this list from the evaluator `variables` array returned by the evaluator endpoints.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py
index d8baee407..418004090 100644
--- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py
+++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py
@@ -42,6 +42,7 @@ class EvaluationRule(UniversalBaseModel):
         EvaluationRuleStatus,
         EvaluationRuleTarget,
         EvaluatorScope,
+        EvaluatorType,
     )
     from langfuse.unstable.evaluation_rules import (
         EvaluationRule,
@@ -55,6 +56,7 @@ class EvaluationRule(UniversalBaseModel):
             id="evaltmpl_123",
             name="answer-correctness",
             scope=EvaluatorScope.PROJECT,
+            type=EvaluatorType.LLM_AS_JUDGE,
         ),
         target=EvaluationRuleTarget.OBSERVATION,
         enabled=True,
@@ -150,7 +152,7 @@ class EvaluationRule(UniversalBaseModel):
 
     mapping: typing.List[EvaluationRuleMapping] = pydantic.Field()
     """
-    Variable mappings used to populate the evaluator prompt from the live target object.
+    Variable mappings used to populate evaluator runtime variables from the live target object.
     """
 
     created_at: typing_extensions.Annotated[
diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py
index 9d1be79de..c27497c9d 100644
--- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py
+++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py
@@ -5,6 +5,7 @@
 import pydantic
 from ....core.pydantic_utilities import UniversalBaseModel
 from ...commons.types.evaluator_scope import EvaluatorScope
+from ...commons.types.evaluator_type import EvaluatorType
 
 
 class EvaluationRuleEvaluator(UniversalBaseModel):
@@ -12,7 +13,7 @@ class EvaluationRuleEvaluator(UniversalBaseModel):
     Resolved evaluator currently used by the evaluation rule.
 
     `id` is the exact active evaluator version.
-    `name` and `scope` identify the evaluator family conceptually.
+    `name`, `scope`, and `type` identify the evaluator family conceptually.
     """
 
     id: str = pydantic.Field()
@@ -30,6 +31,11 @@ class EvaluationRuleEvaluator(UniversalBaseModel):
     Whether the evaluator family is project-owned or Langfuse-managed.
     """
 
+    type: EvaluatorType = pydantic.Field()
+    """
+    Evaluator engine type.
+    """
+
     model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
         extra="allow", frozen=True
     )
diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py
index 25253182f..a2a38723d 100644
--- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py
+++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py
@@ -9,9 +9,10 @@
 
 class EvaluationRuleEvaluatorReference(UniversalBaseModel):
     """
-    Evaluator family reference used when creating or updating an evaluation rule.
+    Evaluator family reference used when updating an evaluation rule.
 
-    `name` and `scope` are enough to identify the evaluator family in the authenticated project context.
+    `name` and `scope` identify the evaluator family in the authenticated project context.
+    A rule's evaluator type cannot be changed, so this reference does not accept a `type`; the family must match the rule's current evaluator type.
     """
 
     name: str = pydantic.Field()
diff --git a/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py
new file mode 100644
index 000000000..ca57fe517
--- /dev/null
+++ b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py
@@ -0,0 +1,33 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+from ....core.pydantic_utilities import UniversalBaseModel
+from ...commons.types.evaluator_scope import EvaluatorScope
+from .llm_as_judge_evaluator_type import LlmAsJudgeEvaluatorType
+
+
+class LlmAsJudgeEvaluationRuleEvaluatorReference(UniversalBaseModel):
+    """
+    LLM-as-judge evaluator family reference used when creating an evaluation rule.
+    """
+
+    name: str = pydantic.Field()
+    """
+    Evaluator family name.
+    """
+
+    scope: EvaluatorScope = pydantic.Field()
+    """
+    Whether the evaluator family is project-owned or Langfuse-managed.
+    """
+
+    type: typing.Optional[LlmAsJudgeEvaluatorType] = pydantic.Field(default=None)
+    """
+    Evaluator engine type. Defaults to `llm_as_judge` when omitted.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py
new file mode 100644
index 000000000..b18856d22
--- /dev/null
+++ b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py
@@ -0,0 +1,15 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+from ....core import enum
+
+T_Result = typing.TypeVar("T_Result")
+
+
+class LlmAsJudgeEvaluatorType(enum.StrEnum):
+    LLM_AS_JUDGE = "llm_as_judge"
+
+    def visit(self, llm_as_judge: typing.Callable[[], T_Result]) -> T_Result:
+        if self is LlmAsJudgeEvaluatorType.LLM_AS_JUDGE:
+            return llm_as_judge()
diff --git a/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py
index 51e2d9288..40e5043a6 100644
--- a/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py
+++ b/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py
@@ -19,8 +19,9 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel):
 
     Practical guidance:
     - If you only want to rename the rule or change sampling, send just those fields.
-    - If you change `evaluator`, send a fresh `mapping` unless you are certain the existing mapping still matches the evaluator variables.
-    - If you change `target`, usually send both `filter` and `mapping` in the same request.
+    - If you change the `evaluator` of an LLM-as-judge rule, send a fresh `mapping` unless you are certain the existing mapping still matches the evaluator variables.
+    - If you change `target` for an LLM-as-judge rule, usually send both `filter` and `mapping` in the same request.
+    - For code evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically.
     - If you change an experiment `datasetId` filter, call `GET /api/public/v2/datasets` and use dataset `id` values from that response.
     """
 
@@ -36,6 +37,7 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel):
     Updated evaluator family.
     
     Langfuse resolves the provided evaluator family to its latest version before saving the rule.
+    A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule.
     """
 
     target: typing.Optional[EvaluationRuleTarget] = pydantic.Field(default=None)
@@ -66,7 +68,9 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel):
         default=None
     )
     """
-    Updated variable mappings.
+    Updated LLM-as-judge variable mappings.
+    
+    Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response.
     """
 
     model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
diff --git a/langfuse/api/unstable/evaluators/__init__.py b/langfuse/api/unstable/evaluators/__init__.py
index 942109740..20a72ef82 100644
--- a/langfuse/api/unstable/evaluators/__init__.py
+++ b/langfuse/api/unstable/evaluators/__init__.py
@@ -6,11 +6,33 @@
 from importlib import import_module
 
 if typing.TYPE_CHECKING:
-    from .types import CreateEvaluatorRequest, Evaluator, Evaluators
+    from .types import (
+        CodeEvaluator,
+        CreateCodeEvaluatorRequest,
+        CreateEvaluatorRequest,
+        CreateEvaluatorRequest_Code,
+        CreateEvaluatorRequest_LlmAsJudge,
+        CreateLlmAsJudgeEvaluatorRequest,
+        Evaluator,
+        EvaluatorBase,
+        Evaluator_Code,
+        Evaluator_LlmAsJudge,
+        Evaluators,
+        LlmAsJudgeEvaluator,
+    )
 _dynamic_imports: typing.Dict[str, str] = {
+    "CodeEvaluator": ".types",
+    "CreateCodeEvaluatorRequest": ".types",
     "CreateEvaluatorRequest": ".types",
+    "CreateEvaluatorRequest_Code": ".types",
+    "CreateEvaluatorRequest_LlmAsJudge": ".types",
+    "CreateLlmAsJudgeEvaluatorRequest": ".types",
     "Evaluator": ".types",
+    "EvaluatorBase": ".types",
+    "Evaluator_Code": ".types",
+    "Evaluator_LlmAsJudge": ".types",
     "Evaluators": ".types",
+    "LlmAsJudgeEvaluator": ".types",
 }
 
 
@@ -41,4 +63,17 @@ def __dir__():
     return sorted(lazy_attrs)
 
 
-__all__ = ["CreateEvaluatorRequest", "Evaluator", "Evaluators"]
+__all__ = [
+    "CodeEvaluator",
+    "CreateCodeEvaluatorRequest",
+    "CreateEvaluatorRequest",
+    "CreateEvaluatorRequest_Code",
+    "CreateEvaluatorRequest_LlmAsJudge",
+    "CreateLlmAsJudgeEvaluatorRequest",
+    "Evaluator",
+    "EvaluatorBase",
+    "Evaluator_Code",
+    "Evaluator_LlmAsJudge",
+    "Evaluators",
+    "LlmAsJudgeEvaluator",
+]
diff --git a/langfuse/api/unstable/evaluators/client.py b/langfuse/api/unstable/evaluators/client.py
index b7f25532a..ac63e2da9 100644
--- a/langfuse/api/unstable/evaluators/client.py
+++ b/langfuse/api/unstable/evaluators/client.py
@@ -4,9 +4,8 @@
 
 from ...core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ...core.request_options import RequestOptions
-from ..commons.types.evaluator_model_config import EvaluatorModelConfig
-from ..commons.types.evaluator_output_definition import EvaluatorOutputDefinition
 from .raw_client import AsyncRawEvaluatorsClient, RawEvaluatorsClient
+from .types.create_evaluator_request import CreateEvaluatorRequest
 from .types.evaluator import Evaluator
 from .types.evaluators import Evaluators
 
@@ -32,16 +31,15 @@ def with_raw_response(self) -> RawEvaluatorsClient:
     def create(
         self,
         *,
-        name: str,
-        prompt: str,
-        output_definition: EvaluatorOutputDefinition,
-        model_config: typing.Optional[EvaluatorModelConfig] = OMIT,
+        request: CreateEvaluatorRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> Evaluator:
         """
         Create an evaluator in the authenticated project.
 
-        Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration.
+        Use evaluators to define **how** Langfuse should score data.
+        LLM-as-judge evaluators define a prompt, expected structured output, and optional model configuration.
+        Code evaluators define source code and a runtime language.
 
         Naming behavior:
         - If this is a new evaluator name in your project, Langfuse creates version `1`.
@@ -54,30 +52,22 @@ def create(
         3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical.
         4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`.
 
+        Code evaluator validation:
+        - At creation, Langfuse only validates the request shape.
+        - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the source code are reported at evaluation-rule creation, not at evaluator creation.
+
         Recovery guidance:
         - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request.
         - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry.
-        - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly.
 
         Unstable API note:
         - This surface may evolve while the underlying evaluation data model is being redesigned.
 
         Parameters
         ----------
-        name : str
-            Evaluator name within the authenticated project.
-
-        prompt : str
-            Prompt template used by the evaluator.
-
-        output_definition : EvaluatorOutputDefinition
-            Structured output schema the evaluator must return.
-
-            Always send `dataType`.
-            Do not send `version`; it is an internal storage detail and not part of the public request contract.
-
-        model_config : typing.Optional[EvaluatorModelConfig]
-            Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+        request : CreateEvaluatorRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -95,6 +85,7 @@ def create(
             EvaluatorOutputDefinition_Numeric,
             EvaluatorOutputFieldDefinition,
         )
+        from langfuse.unstable.evaluators import CreateEvaluatorRequest_LlmAsJudge
 
         client = LangfuseAPI(
             x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
@@ -105,29 +96,27 @@ def create(
             base_url="https://yourhost.com/path/to/api",
         )
         client.unstable.evaluators.create(
-            name="answer-correctness",
-            prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n",
-            output_definition=EvaluatorOutputDefinition_Numeric(
-                data_type=EvaluatorOutputDataType.NUMERIC,
-                reasoning=EvaluatorOutputFieldDefinition(
-                    description="Explain why the score was assigned.",
+            request=CreateEvaluatorRequest_LlmAsJudge(
+                name="answer-correctness",
+                prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n",
+                output_definition=EvaluatorOutputDefinition_Numeric(
+                    data_type=EvaluatorOutputDataType.NUMERIC,
+                    reasoning=EvaluatorOutputFieldDefinition(
+                        description="Explain why the score was assigned.",
+                    ),
+                    score=EvaluatorOutputFieldDefinition(
+                        description="Correctness score between 0 and 1.",
+                    ),
                 ),
-                score=EvaluatorOutputFieldDefinition(
-                    description="Correctness score between 0 and 1.",
+                model_config=EvaluatorModelConfig(
+                    provider="openai",
+                    model="gpt-4.1-mini",
                 ),
             ),
-            model_config=EvaluatorModelConfig(
-                provider="openai",
-                model="gpt-4.1-mini",
-            ),
         )
         """
         _response = self._raw_client.create(
-            name=name,
-            prompt=prompt,
-            output_definition=output_definition,
-            model_config=model_config,
-            request_options=request_options,
+            request=request, request_options=request_options
         )
         return _response.data
 
@@ -241,16 +230,15 @@ def with_raw_response(self) -> AsyncRawEvaluatorsClient:
     async def create(
         self,
         *,
-        name: str,
-        prompt: str,
-        output_definition: EvaluatorOutputDefinition,
-        model_config: typing.Optional[EvaluatorModelConfig] = OMIT,
+        request: CreateEvaluatorRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> Evaluator:
         """
         Create an evaluator in the authenticated project.
 
-        Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration.
+        Use evaluators to define **how** Langfuse should score data.
+        LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration.
+        Code evaluators define source code and a runtime language.
 
         Naming behavior:
         - If this is a new evaluator name in your project, Langfuse creates version `1`.
@@ -263,30 +251,22 @@ async def create(
         3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical.
         4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`.
 
+        Code evaluator validation:
+        - At creation, Langfuse only validates the request shape.
+        - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation.
+
         Recovery guidance:
         - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request.
         - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry.
-        - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly.
 
         Unstable API note:
         - This surface may evolve while the underlying evaluation data model is being redesigned.
 
         Parameters
         ----------
-        name : str
-            Evaluator name within the authenticated project.
-
-        prompt : str
-            Prompt template used by the evaluator.
-
-        output_definition : EvaluatorOutputDefinition
-            Structured output schema the evaluator must return.
-
-            Always send `dataType`.
-            Do not send `version`; it is an internal storage detail and not part of the public request contract.
-
-        model_config : typing.Optional[EvaluatorModelConfig]
-            Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+        request : CreateEvaluatorRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -306,6 +286,7 @@ async def create(
             EvaluatorOutputDefinition_Numeric,
             EvaluatorOutputFieldDefinition,
         )
+        from langfuse.unstable.evaluators import CreateEvaluatorRequest_LlmAsJudge
 
         client = AsyncLangfuseAPI(
             x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME",
@@ -319,32 +300,30 @@ async def create(
 
         async def main() -> None:
             await client.unstable.evaluators.create(
-                name="answer-correctness",
-                prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n",
-                output_definition=EvaluatorOutputDefinition_Numeric(
-                    data_type=EvaluatorOutputDataType.NUMERIC,
-                    reasoning=EvaluatorOutputFieldDefinition(
-                        description="Explain why the score was assigned.",
+                request=CreateEvaluatorRequest_LlmAsJudge(
+                    name="answer-correctness",
+                    prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n",
+                    output_definition=EvaluatorOutputDefinition_Numeric(
+                        data_type=EvaluatorOutputDataType.NUMERIC,
+                        reasoning=EvaluatorOutputFieldDefinition(
+                            description="Explain why the score was assigned.",
+                        ),
+                        score=EvaluatorOutputFieldDefinition(
+                            description="Correctness score between 0 and 1.",
+                        ),
                     ),
-                    score=EvaluatorOutputFieldDefinition(
-                        description="Correctness score between 0 and 1.",
+                    model_config=EvaluatorModelConfig(
+                        provider="openai",
+                        model="gpt-4.1-mini",
                     ),
                 ),
-                model_config=EvaluatorModelConfig(
-                    provider="openai",
-                    model="gpt-4.1-mini",
-                ),
             )
 
 
         asyncio.run(main())
         """
         _response = await self._raw_client.create(
-            name=name,
-            prompt=prompt,
-            output_definition=output_definition,
-            model_config=model_config,
-            request_options=request_options,
+            request=request, request_options=request_options
         )
         return _response.data
 
diff --git a/langfuse/api/unstable/evaluators/raw_client.py b/langfuse/api/unstable/evaluators/raw_client.py
index f599e3298..30034d033 100644
--- a/langfuse/api/unstable/evaluators/raw_client.py
+++ b/langfuse/api/unstable/evaluators/raw_client.py
@@ -23,8 +23,6 @@
 from ...core.pydantic_utilities import parse_obj_as
 from ...core.request_options import RequestOptions
 from ...core.serialization import convert_and_respect_annotation_metadata
-from ..commons.types.evaluator_model_config import EvaluatorModelConfig
-from ..commons.types.evaluator_output_definition import EvaluatorOutputDefinition
 from ..errors.errors.access_denied_error import (
     AccessDeniedError as unstable_errors_errors_access_denied_error_AccessDeniedError,
 )
@@ -43,6 +41,7 @@
 )
 from ..errors.errors.unprocessable_content_error import UnprocessableContentError
 from ..errors.types.public_api_error import PublicApiError
+from .types.create_evaluator_request import CreateEvaluatorRequest
 from .types.evaluator import Evaluator
 from .types.evaluators import Evaluators
 
@@ -57,16 +56,15 @@ def __init__(self, *, client_wrapper: SyncClientWrapper):
     def create(
         self,
         *,
-        name: str,
-        prompt: str,
-        output_definition: EvaluatorOutputDefinition,
-        model_config: typing.Optional[EvaluatorModelConfig] = OMIT,
+        request: CreateEvaluatorRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[Evaluator]:
         """
         Create an evaluator in the authenticated project.
 
-        Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration.
+        Use evaluators to define **how** Langfuse should score data.
+        LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration.
+        Code evaluators define source code and a runtime language.
 
         Naming behavior:
         - If this is a new evaluator name in your project, Langfuse creates version `1`.
@@ -79,30 +77,22 @@ def create(
         3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical.
         4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`.
 
+        Code evaluator validation:
+        - At creation, Langfuse only validates the request shape.
+        - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation.
+
         Recovery guidance:
         - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request.
         - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry.
-        - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly.
 
         Unstable API note:
         - This surface may evolve while the underlying evaluation data model is being redesigned.
 
         Parameters
         ----------
-        name : str
-            Evaluator name within the authenticated project.
-
-        prompt : str
-            Prompt template used by the evaluator.
-
-        output_definition : EvaluatorOutputDefinition
-            Structured output schema the evaluator must return.
-
-            Always send `dataType`.
-            Do not send `version`; it is an internal storage detail and not part of the public request contract.
-
-        model_config : typing.Optional[EvaluatorModelConfig]
-            Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+        request : CreateEvaluatorRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -114,20 +104,9 @@ def create(
         _response = self._client_wrapper.httpx_client.request(
             "api/public/unstable/evaluators",
             method="POST",
-            json={
-                "name": name,
-                "prompt": prompt,
-                "outputDefinition": convert_and_respect_annotation_metadata(
-                    object_=output_definition,
-                    annotation=EvaluatorOutputDefinition,
-                    direction="write",
-                ),
-                "modelConfig": convert_and_respect_annotation_metadata(
-                    object_=model_config,
-                    annotation=typing.Optional[EvaluatorModelConfig],
-                    direction="write",
-                ),
-            },
+            json=convert_and_respect_annotation_metadata(
+                object_=request, annotation=CreateEvaluatorRequest, direction="write"
+            ),
             request_options=request_options,
             omit=OMIT,
         )
@@ -671,16 +650,15 @@ def __init__(self, *, client_wrapper: AsyncClientWrapper):
     async def create(
         self,
         *,
-        name: str,
-        prompt: str,
-        output_definition: EvaluatorOutputDefinition,
-        model_config: typing.Optional[EvaluatorModelConfig] = OMIT,
+        request: CreateEvaluatorRequest,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[Evaluator]:
         """
         Create an evaluator in the authenticated project.
 
-        Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration.
+        Use evaluators to define **how** Langfuse should score data.
+        LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration.
+        Code evaluators define source code and a runtime language.
 
         Naming behavior:
         - If this is a new evaluator name in your project, Langfuse creates version `1`.
@@ -693,30 +671,22 @@ async def create(
         3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical.
         4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`.
 
+        Code evaluator validation:
+        - At creation, Langfuse only validates the request shape.
+        - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation.
+
         Recovery guidance:
         - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request.
         - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry.
-        - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape.
+        - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly.
 
         Unstable API note:
         - This surface may evolve while the underlying evaluation data model is being redesigned.
 
         Parameters
         ----------
-        name : str
-            Evaluator name within the authenticated project.
-
-        prompt : str
-            Prompt template used by the evaluator.
-
-        output_definition : EvaluatorOutputDefinition
-            Structured output schema the evaluator must return.
-
-            Always send `dataType`.
-            Do not send `version`; it is an internal storage detail and not part of the public request contract.
-
-        model_config : typing.Optional[EvaluatorModelConfig]
-            Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+        request : CreateEvaluatorRequest
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -728,20 +698,9 @@ async def create(
         _response = await self._client_wrapper.httpx_client.request(
             "api/public/unstable/evaluators",
             method="POST",
-            json={
-                "name": name,
-                "prompt": prompt,
-                "outputDefinition": convert_and_respect_annotation_metadata(
-                    object_=output_definition,
-                    annotation=EvaluatorOutputDefinition,
-                    direction="write",
-                ),
-                "modelConfig": convert_and_respect_annotation_metadata(
-                    object_=model_config,
-                    annotation=typing.Optional[EvaluatorModelConfig],
-                    direction="write",
-                ),
-            },
+            json=convert_and_respect_annotation_metadata(
+                object_=request, annotation=CreateEvaluatorRequest, direction="write"
+            ),
             request_options=request_options,
             omit=OMIT,
         )
diff --git a/langfuse/api/unstable/evaluators/types/__init__.py b/langfuse/api/unstable/evaluators/types/__init__.py
index 6e7a13233..650598592 100644
--- a/langfuse/api/unstable/evaluators/types/__init__.py
+++ b/langfuse/api/unstable/evaluators/types/__init__.py
@@ -6,13 +6,31 @@
 from importlib import import_module
 
 if typing.TYPE_CHECKING:
-    from .create_evaluator_request import CreateEvaluatorRequest
-    from .evaluator import Evaluator
+    from .code_evaluator import CodeEvaluator
+    from .create_code_evaluator_request import CreateCodeEvaluatorRequest
+    from .create_evaluator_request import (
+        CreateEvaluatorRequest,
+        CreateEvaluatorRequest_Code,
+        CreateEvaluatorRequest_LlmAsJudge,
+    )
+    from .create_llm_as_judge_evaluator_request import CreateLlmAsJudgeEvaluatorRequest
+    from .evaluator import Evaluator, Evaluator_Code, Evaluator_LlmAsJudge
+    from .evaluator_base import EvaluatorBase
     from .evaluators import Evaluators
+    from .llm_as_judge_evaluator import LlmAsJudgeEvaluator
 _dynamic_imports: typing.Dict[str, str] = {
+    "CodeEvaluator": ".code_evaluator",
+    "CreateCodeEvaluatorRequest": ".create_code_evaluator_request",
     "CreateEvaluatorRequest": ".create_evaluator_request",
+    "CreateEvaluatorRequest_Code": ".create_evaluator_request",
+    "CreateEvaluatorRequest_LlmAsJudge": ".create_evaluator_request",
+    "CreateLlmAsJudgeEvaluatorRequest": ".create_llm_as_judge_evaluator_request",
     "Evaluator": ".evaluator",
+    "EvaluatorBase": ".evaluator_base",
+    "Evaluator_Code": ".evaluator",
+    "Evaluator_LlmAsJudge": ".evaluator",
     "Evaluators": ".evaluators",
+    "LlmAsJudgeEvaluator": ".llm_as_judge_evaluator",
 }
 
 
@@ -43,4 +61,17 @@ def __dir__():
     return sorted(lazy_attrs)
 
 
-__all__ = ["CreateEvaluatorRequest", "Evaluator", "Evaluators"]
+__all__ = [
+    "CodeEvaluator",
+    "CreateCodeEvaluatorRequest",
+    "CreateEvaluatorRequest",
+    "CreateEvaluatorRequest_Code",
+    "CreateEvaluatorRequest_LlmAsJudge",
+    "CreateLlmAsJudgeEvaluatorRequest",
+    "Evaluator",
+    "EvaluatorBase",
+    "Evaluator_Code",
+    "Evaluator_LlmAsJudge",
+    "Evaluators",
+    "LlmAsJudgeEvaluator",
+]
diff --git a/langfuse/api/unstable/evaluators/types/code_evaluator.py b/langfuse/api/unstable/evaluators/types/code_evaluator.py
new file mode 100644
index 000000000..f8648603d
--- /dev/null
+++ b/langfuse/api/unstable/evaluators/types/code_evaluator.py
@@ -0,0 +1,31 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+import typing_extensions
+from ....core.serialization import FieldMetadata
+from ...commons.types.code_evaluator_source_code_language import (
+    CodeEvaluatorSourceCodeLanguage,
+)
+from .evaluator_base import EvaluatorBase
+
+
+class CodeEvaluator(EvaluatorBase):
+    source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] = (
+        pydantic.Field()
+    )
+    """
+    Source code executed for each matched observation.
+    """
+
+    source_code_language: typing_extensions.Annotated[
+        CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage")
+    ] = pydantic.Field()
+    """
+    Runtime language for `sourceCode`.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py
new file mode 100644
index 000000000..860c15f9a
--- /dev/null
+++ b/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py
@@ -0,0 +1,36 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+import typing_extensions
+from ....core.pydantic_utilities import UniversalBaseModel
+from ....core.serialization import FieldMetadata
+from ...commons.types.code_evaluator_source_code_language import (
+    CodeEvaluatorSourceCodeLanguage,
+)
+
+
+class CreateCodeEvaluatorRequest(UniversalBaseModel):
+    name: str = pydantic.Field()
+    """
+    Evaluator name within the authenticated project.
+    """
+
+    source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] = (
+        pydantic.Field()
+    )
+    """
+    Code executed for each matched observation.
+    """
+
+    source_code_language: typing_extensions.Annotated[
+        CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage")
+    ] = pydantic.Field()
+    """
+    Runtime language for `sourceCode`.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluators/types/create_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_evaluator_request.py
index 7616d99ee..a866aa4c5 100644
--- a/langfuse/api/unstable/evaluators/types/create_evaluator_request.py
+++ b/langfuse/api/unstable/evaluators/types/create_evaluator_request.py
@@ -1,50 +1,66 @@
 # This file was auto-generated by Fern from our API Definition.
 
+from __future__ import annotations
+
 import typing
 
 import pydantic
 import typing_extensions
 from ....core.pydantic_utilities import UniversalBaseModel
 from ....core.serialization import FieldMetadata
+from ...commons.types.code_evaluator_source_code_language import (
+    CodeEvaluatorSourceCodeLanguage,
+)
 from ...commons.types.evaluator_model_config import EvaluatorModelConfig
 from ...commons.types.evaluator_output_definition import EvaluatorOutputDefinition
 
 
-class CreateEvaluatorRequest(UniversalBaseModel):
+class CreateEvaluatorRequest_LlmAsJudge(UniversalBaseModel):
     """
     Request body for creating an evaluator.
 
     If the same `name` already exists in your project, Langfuse creates the next version and returns it.
     Existing evaluation rules in the same project are then moved to that new latest version automatically.
+    If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility.
     """
 
-    name: str = pydantic.Field()
-    """
-    Evaluator name within the authenticated project.
-    """
-
-    prompt: str = pydantic.Field()
-    """
-    Prompt template used by the evaluator.
-    """
-
+    type: typing.Literal["llm_as_judge"] = "llm_as_judge"
+    name: str
+    prompt: str
     output_definition: typing_extensions.Annotated[
         EvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition")
-    ] = pydantic.Field()
-    """
-    Structured output schema the evaluator must return.
-    
-    Always send `dataType`.
-    Do not send `version`; it is an internal storage detail and not part of the public request contract.
-    """
-
+    ]
     model_config_: typing_extensions.Annotated[
         typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig")
-    ] = pydantic.Field(default=None)
+    ] = None
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
+
+
+class CreateEvaluatorRequest_Code(UniversalBaseModel):
     """
-    Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+    Request body for creating an evaluator.
+
+    If the same `name` already exists in your project, Langfuse creates the next version and returns it.
+    Existing evaluation rules in the same project are then moved to that new latest version automatically.
+    If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility.
     """
 
+    type: typing.Literal["code"] = "code"
+    name: str
+    source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")]
+    source_code_language: typing_extensions.Annotated[
+        CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage")
+    ]
+
     model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
         extra="allow", frozen=True
     )
+
+
+CreateEvaluatorRequest = typing_extensions.Annotated[
+    typing.Union[CreateEvaluatorRequest_LlmAsJudge, CreateEvaluatorRequest_Code],
+    pydantic.Field(discriminator="type"),
+]
diff --git a/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py
new file mode 100644
index 000000000..09e121b1b
--- /dev/null
+++ b/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py
@@ -0,0 +1,43 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+import typing_extensions
+from ....core.pydantic_utilities import UniversalBaseModel
+from ....core.serialization import FieldMetadata
+from ...commons.types.evaluator_model_config import EvaluatorModelConfig
+from ...commons.types.evaluator_output_definition import EvaluatorOutputDefinition
+
+
+class CreateLlmAsJudgeEvaluatorRequest(UniversalBaseModel):
+    name: str = pydantic.Field()
+    """
+    Evaluator name within the authenticated project.
+    """
+
+    prompt: str = pydantic.Field()
+    """
+    Prompt template used by the evaluator.
+    """
+
+    output_definition: typing_extensions.Annotated[
+        EvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition")
+    ] = pydantic.Field()
+    """
+    Structured output schema the evaluator must return.
+    
+    Always send `dataType`.
+    Do not send `version`; it is an internal storage detail and not part of the public request contract.
+    """
+
+    model_config_: typing_extensions.Annotated[
+        typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig")
+    ] = pydantic.Field(default=None)
+    """
+    Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
diff --git a/langfuse/api/unstable/evaluators/types/evaluator.py b/langfuse/api/unstable/evaluators/types/evaluator.py
index 8023839fc..69295e0fd 100644
--- a/langfuse/api/unstable/evaluators/types/evaluator.py
+++ b/langfuse/api/unstable/evaluators/types/evaluator.py
@@ -1,5 +1,7 @@
 # This file was auto-generated by Fern from our API Definition.
 
+from __future__ import annotations
+
 import datetime as dt
 import typing
 
@@ -7,30 +9,27 @@
 import typing_extensions
 from ....core.pydantic_utilities import UniversalBaseModel
 from ....core.serialization import FieldMetadata
+from ...commons.types.code_evaluator_source_code_language import (
+    CodeEvaluatorSourceCodeLanguage,
+)
 from ...commons.types.evaluator_model_config import EvaluatorModelConfig
 from ...commons.types.evaluator_scope import EvaluatorScope
-from ...commons.types.evaluator_type import EvaluatorType
 from ...commons.types.public_evaluator_output_definition import (
     PublicEvaluatorOutputDefinition,
 )
 
 
-class Evaluator(UniversalBaseModel):
+class Evaluator_LlmAsJudge(UniversalBaseModel):
     """
     One evaluator that can be used for scoring.
 
-    An evaluator describes **how** to score data:
-    - prompt
-    - extracted prompt variables
-    - output schema
-    - optional explicit model configuration
+    An evaluator describes **how** to score data.
 
     It does not define **which** live objects are evaluated. That is the job of `evaluation-rules`.
 
     For agent clients, the most important fields are:
-    - `variables`: use these exact names when building the evaluation-rule `mapping` array
-    - `outputDefinition`: tells you the expected score type and the evaluator's response instructions
-    - `modelConfig`: tells you whether the evaluator uses the project default model (`null`) or an explicit provider/model
+    - `type`: determines which evaluator fields are present
+    - `variables`: for LLM evaluators, use these exact names when building the evaluation-rule `mapping` array. LLM evaluators require every variable to be mapped. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically.
 
     Versioning behavior:
     - `GET /evaluators` returns the latest version of each available evaluator.
@@ -38,81 +37,78 @@ class Evaluator(UniversalBaseModel):
     - Evaluation rules always run against the latest version for the selected evaluator name within the same source (`project` or `managed`).
     """
 
-    id: str = pydantic.Field()
-    """
-    Identifier of this evaluator.
-    """
+    type: typing.Literal["llm_as_judge"] = "llm_as_judge"
+    prompt: str
+    output_definition: typing_extensions.Annotated[
+        PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition")
+    ]
+    model_config_: typing_extensions.Annotated[
+        typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig")
+    ] = None
+    id: str
+    name: str
+    version: int
+    scope: EvaluatorScope
+    variables: typing.List[str]
+    evaluation_rule_count: typing_extensions.Annotated[
+        int, FieldMetadata(alias="evaluationRuleCount")
+    ]
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")
+    ]
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")
+    ]
 
-    name: str = pydantic.Field()
-    """
-    Evaluator name.
-    """
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
+        extra="allow", frozen=True
+    )
 
-    version: int = pydantic.Field()
-    """
-    Version number of this evaluator.
-    """
 
-    scope: EvaluatorScope = pydantic.Field()
-    """
-    Where this evaluator comes from: your project or Langfuse-managed defaults.
+class Evaluator_Code(UniversalBaseModel):
     """
+    One evaluator that can be used for scoring.
 
-    type: EvaluatorType = pydantic.Field()
-    """
-    Evaluator engine type. Currently always `llm_as_judge`.
-    """
+    An evaluator describes **how** to score data.
 
-    prompt: str = pydantic.Field()
-    """
-    Prompt template used during evaluation.
-    """
-
-    variables: typing.List[str] = pydantic.Field()
-    """
-    Variables extracted from the evaluator prompt.
-    
-    Every variable in this list must be mapped exactly once when creating an evaluation rule.
-    """
+    It does not define **which** live objects are evaluated. That is the job of `evaluation-rules`.
 
-    output_definition: typing_extensions.Annotated[
-        PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition")
-    ] = pydantic.Field()
-    """
-    Structured output schema returned by this evaluator.
-    
-    Responses always include `dataType` and omit the internal output-definition `version`.
-    Use `dataType` to decide how future scores should be interpreted.
-    """
+    For agent clients, the most important fields are:
+    - `type`: determines which evaluator fields are present
+    - `variables`: for LLM evaluators, use these exact names when building the evaluation-rule `mapping` array. LLM evaluators require every variable to be mapped. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically.
 
-    model_config_: typing_extensions.Annotated[
-        typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig")
-    ] = pydantic.Field(default=None)
-    """
-    Explicit model configuration, or `null` when the project default evaluation model is used.
+    Versioning behavior:
+    - `GET /evaluators` returns the latest version of each available evaluator.
+    - `GET /evaluators/{id}` can return an older version.
+    - Evaluation rules always run against the latest version for the selected evaluator name within the same source (`project` or `managed`).
     """
 
+    type: typing.Literal["code"] = "code"
+    source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")]
+    source_code_language: typing_extensions.Annotated[
+        CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage")
+    ]
+    id: str
+    name: str
+    version: int
+    scope: EvaluatorScope
+    variables: typing.List[str]
     evaluation_rule_count: typing_extensions.Annotated[
         int, FieldMetadata(alias="evaluationRuleCount")
-    ] = pydantic.Field()
-    """
-    Number of evaluation rules in the project that currently use this evaluator version.
-    """
-
+    ]
     created_at: typing_extensions.Annotated[
         dt.datetime, FieldMetadata(alias="createdAt")
-    ] = pydantic.Field()
-    """
-    Timestamp when this evaluator was created.
-    """
-
+    ]
     updated_at: typing_extensions.Annotated[
         dt.datetime, FieldMetadata(alias="updatedAt")
-    ] = pydantic.Field()
-    """
-    Timestamp when this evaluator was last updated.
-    """
+    ]
 
     model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
         extra="allow", frozen=True
     )
+
+
+Evaluator = typing_extensions.Annotated[
+    typing.Union[Evaluator_LlmAsJudge, Evaluator_Code],
+    pydantic.Field(discriminator="type"),
+]
diff --git a/langfuse/api/unstable/evaluators/types/evaluator_base.py b/langfuse/api/unstable/evaluators/types/evaluator_base.py
new file mode 100644
index 000000000..7a8362657
--- /dev/null
+++ b/langfuse/api/unstable/evaluators/types/evaluator_base.py
@@ -0,0 +1,64 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+import pydantic
+import typing_extensions
+from ....core.pydantic_utilities import UniversalBaseModel
+from ....core.serialization import FieldMetadata
+from ...commons.types.evaluator_scope import EvaluatorScope
+
+
+class EvaluatorBase(UniversalBaseModel):  # Shared evaluator fields; subclassed by LlmAsJudgeEvaluator.
+    id: str = pydantic.Field()  # required: Field() declares no default (true for every field in this class)
+    """
+    Identifier of this evaluator.
+    """
+
+    name: str = pydantic.Field()
+    """
+    Evaluator name.
+    """
+
+    version: int = pydantic.Field()
+    """
+    Version number of this evaluator.
+    """
+
+    scope: EvaluatorScope = pydantic.Field()  # EvaluatorScope comes from commons.types.evaluator_scope
+    """
+    Where this evaluator comes from: your project or Langfuse-managed defaults.
+    """
+
+    variables: typing.List[str] = pydantic.Field()
+    """
+    Variables that can be mapped when creating an evaluation rule.
+    
+    LLM evaluators require every variable to be mapped exactly once. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically.
+    """
+
+    evaluation_rule_count: typing_extensions.Annotated[
+        int, FieldMetadata(alias="evaluationRuleCount")  # camelCase alias; presumably the JSON wire name - confirm in core.serialization
+    ] = pydantic.Field()
+    """
+    Number of evaluation rules in the project that currently use this evaluator version.
+    """
+
+    created_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="createdAt")  # same aliasing pattern as evaluation_rule_count
+    ] = pydantic.Field()
+    """
+    Timestamp when this evaluator was created.
+    """
+
+    updated_at: typing_extensions.Annotated[
+        dt.datetime, FieldMetadata(alias="updatedAt")  # same aliasing pattern as evaluation_rule_count
+    ] = pydantic.Field()
+    """
+    Timestamp when this evaluator was last updated.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(  # pydantic settings, not a data field (ClassVar)
+        extra="allow", frozen=True  # unknown keys are kept rather than rejected; instances are immutable once built
+    )
diff --git a/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py b/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py
new file mode 100644
index 000000000..0cf186f47
--- /dev/null
+++ b/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py
@@ -0,0 +1,40 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+import pydantic
+import typing_extensions
+from ....core.serialization import FieldMetadata
+from ...commons.types.evaluator_model_config import EvaluatorModelConfig
+from ...commons.types.public_evaluator_output_definition import (
+    PublicEvaluatorOutputDefinition,
+)
+from .evaluator_base import EvaluatorBase
+
+
+class LlmAsJudgeEvaluator(EvaluatorBase):  # Adds prompt, output definition and optional model config to the EvaluatorBase fields.
+    prompt: str = pydantic.Field()  # required: Field() declares no default
+    """
+    Prompt template used during evaluation.
+    """
+
+    output_definition: typing_extensions.Annotated[
+        PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition")  # camelCase alias; presumably the JSON wire name - confirm
+    ] = pydantic.Field()  # required: no default
+    """
+    Structured output schema returned by this evaluator.
+    
+    Responses always include `dataType` and omit the internal output-definition `version`.
+    Use `dataType` to decide how future scores should be interpreted.
+    """
+
+    model_config_: typing_extensions.Annotated[  # trailing underscore presumably avoids clashing with the pydantic `model_config` attribute below
+        typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig")
+    ] = pydantic.Field(default=None)  # the only field declared in this class that has a default
+    """
+    Explicit model configuration, or `null` when the project default evaluation model is used.
+    """
+
+    model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(  # pydantic settings, not a data field (ClassVar)
+        extra="allow", frozen=True  # unknown keys are kept rather than rejected; instances are immutable once built
+    )