pydantic_evals.evaluators

Contains dataclass

Bases: Evaluator[object, object, object]

Check if the output contains the provided value.

For strings, checks whether value is a substring of the output. For lists/tuples, checks whether value is an element of the output. For dicts, checks whether all key-value pairs in value are present in the output.

Note: case_sensitive only applies when both the value and output are strings.
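A minimal usage sketch (not taken from the library source), wiring Contains into a Dataset; the case, task, and substring are invented for illustration:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains

dataset = Dataset(
    cases=[Case(name='greeting', inputs='World')],
    evaluators=[Contains(value='hello', case_sensitive=False)],
)


async def greet(name: str) -> str:
    return f'Hello, {name}!'


report = dataset.evaluate_sync(greet)
report.print()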

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Contains(Evaluator[object, object, object]):
    """Check if the output contains the expected output.

    For strings, checks if expected_output is a substring of output.
    For lists/tuples, checks if expected_output is in output.
    For dicts, checks if all key-value pairs in expected_output are in output.

    Note: case_sensitive only applies when both the value and output are strings.
    """

    value: Any
    case_sensitive: bool = True
    as_strings: bool = False

    def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> EvaluationReason:
        # Convert objects to strings if requested
        failure_reason: str | None = None
        as_strings = self.as_strings or (isinstance(self.value, str) and isinstance(ctx.output, str))
        if as_strings:
            output_str = str(ctx.output)
            expected_str = str(self.value)

            if not self.case_sensitive:
                output_str = output_str.lower()
                expected_str = expected_str.lower()

            failure_reason: str | None = None
            if expected_str not in output_str:
                output_trunc = _truncated_repr(output_str, max_length=100)
                expected_trunc = _truncated_repr(expected_str, max_length=100)
                failure_reason = f'Output string {output_trunc} does not contain expected string {expected_trunc}'
            return EvaluationReason(value=failure_reason is None, reason=failure_reason)

        try:
            # Handle different collection types
            if isinstance(ctx.output, dict):
                if isinstance(self.value, dict):
                    # Cast to Any to avoid type checking issues
                    output_dict = cast(dict[Any, Any], ctx.output)  # pyright: ignore[reportUnknownMemberType]
                    expected_dict = cast(dict[Any, Any], self.value)  # pyright: ignore[reportUnknownMemberType]
                    for k in expected_dict:
                        if k not in output_dict:
                            k_trunc = _truncated_repr(k, max_length=30)
                            failure_reason = f'Output dictionary does not contain expected key {k_trunc}'
                            break
                        elif output_dict[k] != expected_dict[k]:
                            k_trunc = _truncated_repr(k, max_length=30)
                            output_v_trunc = _truncated_repr(output_dict[k], max_length=100)
                            expected_v_trunc = _truncated_repr(expected_dict[k], max_length=100)
                            failure_reason = f'Output dictionary has different value for key {k_trunc}: {output_v_trunc} != {expected_v_trunc}'
                            break
                else:
                    if self.value not in ctx.output:  # pyright: ignore[reportUnknownMemberType]
                        output_trunc = _truncated_repr(ctx.output, max_length=200)  # pyright: ignore[reportUnknownMemberType]
                        failure_reason = f'Output {output_trunc} does not contain provided value as a key'
            elif self.value not in ctx.output:  # pyright: ignore[reportOperatorIssue]  # will be handled by except block
                output_trunc = _truncated_repr(ctx.output, max_length=200)
                failure_reason = f'Output {output_trunc} does not contain provided value'
        except (TypeError, ValueError) as e:
            failure_reason = f'Containment check failed: {e}'

        return EvaluationReason(value=failure_reason is None, reason=failure_reason)

Equals dataclass

Bases: Evaluator[object, object, object]

Check if the output exactly equals the provided value.

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Equals(Evaluator[object, object, object]):
    """Check if the output exactly equals the provided value."""

    value: Any

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        return ctx.output == self.value

EqualsExpected dataclass

Bases: Evaluator[object, object, object]

Check if the output exactly equals the expected output.
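A short sketch contrasting EqualsExpected with Equals above: Equals compares the output against a fixed value, while EqualsExpected compares it against each case's expected_output. The case and task are invented for illustration:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Equals, EqualsExpected

dataset = Dataset(
    cases=[Case(name='addition', inputs=(1, 2), expected_output=3)],
    evaluators=[
        Equals(value=3),   # passes only when the output equals this fixed value
        EqualsExpected(),  # passes when the output equals the case's expected_output
    ],
)


async def add(numbers: tuple[int, int]) -> int:
    return numbers[0] + numbers[1]


report = dataset.evaluate_sync(add)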

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class EqualsExpected(Evaluator[object, object, object]):
    """Check if the output exactly equals the expected output."""

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool | dict[str, bool]:
        if ctx.expected_output is None:
            return {}  # Only compare if expected output is provided
        return ctx.output == ctx.expected_output

HasMatchingSpan dataclass

Bases: Evaluator[object, object, object]

Check if the span tree contains a span that matches the specified query.
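A hedged sketch: the 'name_contains' key below is an assumption about the SpanQuery TypedDict (check the SpanQuery definition for the full set of conditions), and span matching requires OpenTelemetry span recording to be configured (e.g. via logfire), otherwise accessing the span tree raises SpanTreeRecordingError:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import HasMatchingSpan

dataset = Dataset(
    cases=[Case(name='lookup', inputs='pydantic')],
    # 'name_contains' is assumed here; adjust to the actual SpanQuery fields.
    evaluators=[HasMatchingSpan(query={'name_contains': 'search'})],
)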

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class HasMatchingSpan(Evaluator[object, object, object]):
    """Check if the span tree contains a span that matches the specified query."""

    query: SpanQuery

    def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> bool:
        return ctx.span_tree.any(self.query)

IsInstance dataclass

Bases: Evaluator[object, object, object]

Check if the output is an instance of a type with the given name.
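A minimal sketch (the case is invented): the check passes when the name of the output's type, or of any class in its MRO, matches type_name:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance

dataset = Dataset(
    cases=[Case(name='greeting', inputs='World')],
    # Passes if the output's type (or any of its base classes) is named 'str'.
    evaluators=[IsInstance(type_name='str')],
)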

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class IsInstance(Evaluator[object, object, object]):
    """Check if the output is an instance of a type with the given name."""

    type_name: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluationReason:
        output = ctx.output
        for cls in type(output).__mro__:
            if cls.__name__ == self.type_name or cls.__qualname__ == self.type_name:
                return EvaluationReason(value=True)

        reason = f'output is of type {type(output).__name__}'
        if type(output).__qualname__ != type(output).__name__:
            reason += f' (qualname: {type(output).__qualname__})'
        return EvaluationReason(value=False, reason=reason)

LLMJudge dataclass

Bases: Evaluator[object, object, object]

Judge whether the output of a language model meets the criteria of a provided rubric.
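A usage sketch with an invented rubric and case; running it calls the configured judge model (the default is openai:gpt-4o, so the corresponding provider credentials must be available):

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?')],
    evaluators=[
        LLMJudge(
            rubric='The answer correctly states that Paris is the capital of France.',
            include_input=True,  # also show the case inputs to the judge model
        ),
    ],
)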

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class LLMJudge(Evaluator[object, object, object]):
    """Judge whether the output of a language model meets the criteria of a provided rubric."""

    rubric: str
    model: models.Model | models.KnownModelName = 'openai:gpt-4o'
    include_input: bool = False

    async def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> EvaluationReason:
        if self.include_input:
            from .llm_as_a_judge import judge_input_output

            grading_output = await judge_input_output(ctx.inputs, ctx.output, self.rubric, self.model)
        else:
            from .llm_as_a_judge import judge_output

            grading_output = await judge_output(ctx.output, self.rubric, self.model)
        return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)

MaxDuration dataclass

Bases: Evaluator[object, object, object]

Check if the execution time is under the specified maximum.
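A short sketch showing the two accepted forms of the limit:

from datetime import timedelta

from pydantic_evals.evaluators import MaxDuration

# Equivalent ways to require the task to finish within half a second.
fast_enough = MaxDuration(seconds=0.5)
also_fast_enough = MaxDuration(seconds=timedelta(milliseconds=500))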

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class MaxDuration(Evaluator[object, object, object]):
    """Check if the execution time is under the specified maximum."""

    seconds: float | timedelta

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        duration = timedelta(seconds=ctx.duration)
        seconds = self.seconds
        if not isinstance(seconds, timedelta):
            seconds = timedelta(seconds=seconds)
        return duration <= seconds

Python dataclass

Bases: Evaluator[object, object, object]

The output of this evaluator is the result of evaluating the provided Python expression.

WARNING: this evaluator runs arbitrary Python code, so you should NEVER use it with untrusted inputs.
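A minimal sketch with an invented expression; the string is passed to eval() with the EvaluatorContext bound to ctx:

from pydantic_evals.evaluators import Python

# Only use expressions you control - the string is evaluated with eval().
length_check = Python(expression='len(str(ctx.output)) < 280')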

Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Python(Evaluator[object, object, object]):
    """The output of this evaluator is the result of evaluating the provided Python expression.

    ***WARNING***: this evaluator runs arbitrary Python code, so you should ***NEVER*** use it with untrusted inputs.
    """

    expression: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
        return eval(self.expression, {'ctx': ctx})

EvaluatorContext dataclass

Bases: Generic[InputsT, OutputT, MetadataT]

Context for evaluating a task execution.

An instance of this class is the sole input to all Evaluators. It contains all the information needed to evaluate the task execution, including inputs, outputs, metadata, and telemetry data.

Evaluators use this context to access the task inputs, actual output, expected output, and other information when evaluating the result of the task execution.

Example:

from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Use the context to access task inputs, outputs, and expected outputs
        return ctx.output == ctx.expected_output
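Beyond output and expected_output, the context also exposes timing and custom instrumentation. A hedged sketch of an evaluator using duration and metrics; the 'llm_calls' metric name is hypothetical and only present if the task records it:

from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class Efficient(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # 'llm_calls' is a hypothetical metric, populated only if the task calls
        # increment_eval_metric('llm_calls', ...); duration is in seconds.
        return ctx.metrics.get('llm_calls', 0) <= 2 and ctx.duration < 5.0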

Source code in pydantic_evals/pydantic_evals/evaluators/context.py
@dataclass
class EvaluatorContext(Generic[InputsT, OutputT, MetadataT]):
    """Context for evaluating a task execution.

    An instance of this class is the sole input to all Evaluators. It contains all the information
    needed to evaluate the task execution, including inputs, outputs, metadata, and telemetry data.

    Evaluators use this context to access the task inputs, actual output, expected output, and other
    information when evaluating the result of the task execution.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class ExactMatch(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            # Use the context to access task inputs, outputs, and expected outputs
            return ctx.output == ctx.expected_output
    ```
    """

    name: str | None
    """The name of the case."""
    inputs: InputsT
    """The inputs provided to the task for this case."""
    metadata: MetadataT | None
    """Metadata associated with the case, if provided. May be None if no metadata was specified."""
    expected_output: OutputT | None
    """The expected output for the case, if provided. May be None if no expected output was specified."""

    output: OutputT
    """The actual output produced by the task for this case."""
    duration: float
    """The duration of the task run for this case."""
    _span_tree: SpanTree | SpanTreeRecordingError = field(repr=False)
    """The span tree for the task run for this case.

    This will be `None` if `logfire.configure` has not been called.
    """

    attributes: dict[str, Any]
    """Attributes associated with the task run for this case.

    These can be set by calling `pydantic_evals.dataset.set_eval_attribute` in any code executed
    during the evaluation task."""
    metrics: dict[str, int | float]
    """Metrics associated with the task run for this case.

    These can be set by calling `pydantic_evals.dataset.increment_eval_metric` in any code executed
    during the evaluation task."""

    @property
    def span_tree(self) -> SpanTree:
        """Get the `SpanTree` for this task execution.

        The span tree is a graph where each node corresponds to an OpenTelemetry span recorded during the task
        execution, including timing information and any custom spans created during execution.

        Returns:
            The span tree for the task execution.

        Raises:
            SpanTreeRecordingError: If spans were not captured during execution of the task, e.g. due to not having
                the necessary dependencies installed.
        """
        if isinstance(self._span_tree, SpanTreeRecordingError):
            # In this case, there was a reason we couldn't record the SpanTree. We raise that now
            raise self._span_tree
        return self._span_tree

name instance-attribute

name: str | None

The name of the case.

inputs instance-attribute

inputs: InputsT

The inputs provided to the task for this case.

metadata instance-attribute

metadata: MetadataT | None

Metadata associated with the case, if provided. May be None if no metadata was specified.

expected_output instance-attribute

expected_output: OutputT | None

The expected output for the case, if provided. May be None if no expected output was specified.

output instance-attribute

output: OutputT

The actual output produced by the task for this case.

duration instance-attribute

duration: float

The duration of the task run for this case.

attributes instance-attribute

attributes: dict[str, Any]

Attributes associated with the task run for this case.

These can be set by calling pydantic_evals.dataset.set_eval_attribute in any code executed during the evaluation task.

metrics instance-attribute

metrics: dict[str, int | float]

Metrics associated with the task run for this case.

These can be set by calling pydantic_evals.dataset.increment_eval_metric in any code executed during the evaluation task.

span_tree property

span_tree: SpanTree

Get the SpanTree for this task execution.

The span tree is a graph where each node corresponds to an OpenTelemetry span recorded during the task execution, including timing information and any custom spans created during execution.

Returns:

Type Description
SpanTree

The span tree for the task execution.

Raises:

Type Description
SpanTreeRecordingError

If spans were not captured during execution of the task, e.g. due to not having the necessary dependencies installed.

EvaluationReason dataclass

The result of running an evaluator with an optional explanation.

Contains a scalar value and an optional "reason" explaining the value.

Parameters:

Name Type Description Default
value EvaluationScalar

The scalar result of the evaluation (boolean, integer, float, or string).

required
reason str | None

An optional explanation of the evaluation result.

None
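A hedged sketch of a custom evaluator returning an EvaluationReason so that failures carry an explanation (the example check is invented, and the import path is assumed from this page's module):

from dataclasses import dataclass

from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class StartsWithGreeting(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        if str(ctx.output).startswith('Hello'):
            return EvaluationReason(value=True)
        return EvaluationReason(value=False, reason='output does not start with "Hello"')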
Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@dataclass
class EvaluationReason:
    """The result of running an evaluator with an optional explanation.

    Contains a scalar value and an optional "reason" explaining the value.

    Args:
        value: The scalar result of the evaluation (boolean, integer, float, or string).
        reason: An optional explanation of the evaluation result.
    """

    value: EvaluationScalar
    reason: str | None = None

EvaluationResult dataclass

Bases: Generic[EvaluationScalarT]

The details of an individual evaluation result.

Contains the name, value, reason, and source evaluator for a single evaluation.

Parameters:

Name Type Description Default
name str

The name of the evaluation.

required
value EvaluationScalarT

The scalar result of the evaluation.

required
reason str | None

An optional explanation of the evaluation result.

required
source Evaluator

The evaluator that produced this result.

required
Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@dataclass
class EvaluationResult(Generic[EvaluationScalarT]):
    """The details of an individual evaluation result.

    Contains the name, value, reason, and source evaluator for a single evaluation.

    Args:
        name: The name of the evaluation.
        value: The scalar result of the evaluation.
        reason: An optional explanation of the evaluation result.
        source: The evaluator that produced this result.
    """

    name: str
    value: EvaluationScalarT
    reason: str | None
    source: Evaluator

    def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
        """Attempt to downcast this result to a more specific type.

        Args:
            *value_types: The types to check the value against.

        Returns:
            A downcast version of this result if the value is an instance of one of the given types,
            otherwise None.
        """
        # Check if value matches any of the target types, handling bool as a special case
        for value_type in value_types:
            if isinstance(self.value, value_type):
                # Only match bool with explicit bool type
                if isinstance(self.value, bool) and value_type is not bool:
                    continue
                return cast(EvaluationResult[T], self)
        return None

downcast

downcast(
    *value_types: type[T],
) -> EvaluationResult[T] | None

Attempt to downcast this result to a more specific type.

Parameters:

Name Type Description Default
*value_types type[T]

The types to check the value against.

()

Returns:

Type Description
EvaluationResult[T] | None

A downcast version of this result if the value is an instance of one of the given types, otherwise None.
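A hedged sketch of the bool special case (the import path is assumed from this page's module; in practice EvaluationResult instances come from an evaluation report rather than being constructed directly):

from pydantic_evals.evaluators import EvaluationResult, IsInstance

result = EvaluationResult(name='IsInstance', value=True, reason=None, source=IsInstance(type_name='str'))

assert result.downcast(bool) is result      # a bool value matches only an explicit bool type
assert result.downcast(int, float) is None  # bools are deliberately not treated as ints here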

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
    """Attempt to downcast this result to a more specific type.

    Args:
        *value_types: The types to check the value against.

    Returns:
        A downcast version of this result if the value is an instance of one of the given types,
        otherwise None.
    """
    # Check if value matches any of the target types, handling bool as a special case
    for value_type in value_types:
        if isinstance(self.value, value_type):
            # Only match bool with explicit bool type
            if isinstance(self.value, bool) and value_type is not bool:
                continue
            return cast(EvaluationResult[T], self)
    return None

Evaluator dataclass

Bases: Generic[InputsT, OutputT, MetadataT]

Base class for all evaluators.

Evaluators can assess the performance of a task in a variety of ways, as a function of the EvaluatorContext.

Subclasses must implement the evaluate method. Note it can be defined with either def or async def.

Example:

from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return ctx.output == ctx.expected_output
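For comparison, a hedged sketch of an asynchronous evaluator that returns a mapping of named results (both forms are supported; the checks here are invented):

from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class CorrectAndFast(Evaluator):
    async def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
        # An async implementation is awaited by the framework; returning a mapping
        # produces one named result per key.
        return {
            'correct': ctx.output == ctx.expected_output,
            'fast': ctx.duration < 1.0,
        }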

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@dataclass
class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
    """Base class for all evaluators.

    Evaluators can assess the performance of a task in a variety of ways, as a function of the EvaluatorContext.

    Subclasses must implement the `evaluate` method. Note it can be defined with either `def` or `async def`.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class ExactMatch(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.output == ctx.expected_output
    ```
    """

    __pydantic_config__ = ConfigDict(arbitrary_types_allowed=True)

    @classmethod
    def name(cls) -> str:
        """Return the 'name' of this Evaluator to use during serialization.

        Returns:
            The name of the Evaluator, which is typically the class name.
        """
        # Note: if we wanted to prefer snake_case, we could use:
        # from pydantic.alias_generators import to_snake
        # return to_snake(cls.__name__)
        return cls.__name__

    @abstractmethod
    def evaluate(
        self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
    ) -> EvaluatorOutput | Awaitable[EvaluatorOutput]:  # pragma: no cover
        """Evaluate the task output in the given context.

        This is the main evaluation method that subclasses must implement. It can be either synchronous
        or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those. Can be returned either synchronously or as an
            awaitable for asynchronous evaluation.
        """
        raise NotImplementedError('You must implement `evaluate`.')

    def evaluate_sync(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator synchronously, handling both sync and async implementations.

        This method ensures synchronous execution by running any async evaluate implementation
        to completion using run_until_complete.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):  # pragma: no cover
            return get_event_loop().run_until_complete(output)
        else:
            return cast(EvaluatorOutput, output)

    async def evaluate_async(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator asynchronously, handling both sync and async implementations.

        This method ensures asynchronous execution by properly awaiting any async evaluate
        implementation. For synchronous implementations, it returns the result directly.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        # Note: If self.evaluate is synchronous, but you need to prevent this from blocking, override this method with:
        # return await anyio.to_thread.run_sync(self.evaluate, ctx)
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):
            return await output
        else:
            return cast(EvaluatorOutput, output)

    @model_serializer(mode='plain')
    def serialize(self, info: SerializationInfo) -> Any:
        """Serialize this Evaluator to a JSON-serializable form.

        Returns:
            A JSON-serializable representation of this evaluator as an EvaluatorSpec.
        """
        raw_arguments: dict[str, Any] = {}
        for field in fields(self):
            value = getattr(self, field.name)
            # always exclude defaults:
            if field.default is not MISSING:
                if value == field.default:
                    continue
            if field.default_factory is not MISSING:
                if value == field.default_factory():
                    continue
            raw_arguments[field.name] = value

        arguments: None | tuple[Any,] | dict[str, Any]
        if len(raw_arguments) == 0:
            arguments = None
        elif len(raw_arguments) == 1:
            arguments = (next(iter(raw_arguments.values())),)
        else:
            arguments = raw_arguments
        return to_jsonable_python(EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context)

name classmethod

name() -> str

Return the 'name' of this Evaluator to use during serialization.

Returns:

Type Description
str

The name of the Evaluator, which is typically the class name.

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@classmethod
def name(cls) -> str:
    """Return the 'name' of this Evaluator to use during serialization.

    Returns:
        The name of the Evaluator, which is typically the class name.
    """
    # Note: if we wanted to prefer snake_case, we could use:
    # from pydantic.alias_generators import to_snake
    # return to_snake(cls.__name__)
    return cls.__name__

evaluate abstractmethod

evaluate(
    ctx: EvaluatorContext[InputsT, OutputT, MetadataT],
) -> EvaluatorOutput | Awaitable[EvaluatorOutput]

Evaluate the task output in the given context.

This is the main evaluation method that subclasses must implement. It can be either synchronous or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

Parameters:

Name Type Description Default
ctx EvaluatorContext[InputsT, OutputT, MetadataT]

The context containing the inputs, outputs, and metadata for evaluation.

required

Returns:

Type Description
EvaluatorOutput | Awaitable[EvaluatorOutput]

The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping of evaluation names to either of those. Can be returned either synchronously or as an awaitable for asynchronous evaluation.

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@abstractmethod
def evaluate(
    self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
) -> EvaluatorOutput | Awaitable[EvaluatorOutput]:  # pragma: no cover
    """Evaluate the task output in the given context.

    This is the main evaluation method that subclasses must implement. It can be either synchronous
    or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those. Can be returned either synchronously or as an
        awaitable for asynchronous evaluation.
    """
    raise NotImplementedError('You must implement `evaluate`.')

evaluate_sync

evaluate_sync(
    ctx: EvaluatorContext[InputsT, OutputT, MetadataT],
) -> EvaluatorOutput

Run the evaluator synchronously, handling both sync and async implementations.

This method ensures synchronous execution by running any async evaluate implementation to completion using run_until_complete.

Parameters:

Name Type Description Default
ctx EvaluatorContext[InputsT, OutputT, MetadataT]

The context containing the inputs, outputs, and metadata for evaluation.

required

Returns:

Type Description
EvaluatorOutput

The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping of evaluation names to either of those.

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
def evaluate_sync(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
    """Run the evaluator synchronously, handling both sync and async implementations.

    This method ensures synchronous execution by running any async evaluate implementation
    to completion using run_until_complete.

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those.
    """
    output = self.evaluate(ctx)
    if inspect.iscoroutine(output):  # pragma: no cover
        return get_event_loop().run_until_complete(output)
    else:
        return cast(EvaluatorOutput, output)

evaluate_async async

evaluate_async(
    ctx: EvaluatorContext[InputsT, OutputT, MetadataT],
) -> EvaluatorOutput

Run the evaluator asynchronously, handling both sync and async implementations.

This method ensures asynchronous execution by properly awaiting any async evaluate implementation. For synchronous implementations, it returns the result directly.

Parameters:

Name Type Description Default
ctx EvaluatorContext[InputsT, OutputT, MetadataT]

The context containing the inputs, outputs, and metadata for evaluation.

required

Returns:

Type Description
EvaluatorOutput

The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping of evaluation names to either of those.

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
async def evaluate_async(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
    """Run the evaluator asynchronously, handling both sync and async implementations.

    This method ensures asynchronous execution by properly awaiting any async evaluate
    implementation. For synchronous implementations, it returns the result directly.

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those.
    """
    # Note: If self.evaluate is synchronous, but you need to prevent this from blocking, override this method with:
    # return await anyio.to_thread.run_sync(self.evaluate, ctx)
    output = self.evaluate(ctx)
    if inspect.iscoroutine(output):
        return await output
    else:
        return cast(EvaluatorOutput, output)

serialize

serialize(info: SerializationInfo) -> Any

Serialize this Evaluator to a JSON-serializable form.

Returns:

Type Description
Any

A JSON-serializable representation of this evaluator as an EvaluatorSpec.

Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@model_serializer(mode='plain')
def serialize(self, info: SerializationInfo) -> Any:
    """Serialize this Evaluator to a JSON-serializable form.

    Returns:
        A JSON-serializable representation of this evaluator as an EvaluatorSpec.
    """
    raw_arguments: dict[str, Any] = {}
    for field in fields(self):
        value = getattr(self, field.name)
        # always exclude defaults:
        if field.default is not MISSING:
            if value == field.default:
                continue
        if field.default_factory is not MISSING:
            if value == field.default_factory():
                continue
        raw_arguments[field.name] = value

    arguments: None | tuple[Any,] | dict[str, Any]
    if len(raw_arguments) == 0:
        arguments = None
    elif len(raw_arguments) == 1:
        arguments = (next(iter(raw_arguments.values())),)
    else:
        arguments = raw_arguments
    return to_jsonable_python(EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context)

EvaluatorOutput module-attribute

EvaluatorOutput = Union[
    EvaluationScalar,
    EvaluationReason,
    Mapping[str, Union[EvaluationScalar, EvaluationReason]],
]

Type for the output of an evaluator, which can be a scalar, an EvaluationReason, or a mapping of names to either.
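For illustration, the three accepted shapes (a hedged sketch; the values are invented):

from pydantic_evals.evaluators import EvaluationReason

scalar_output = 0.75  # a bare EvaluationScalar
reasoned_output = EvaluationReason(value=False, reason='answer was too long')
mapping_output = {
    'accuracy': 0.9,
    'grounded': EvaluationReason(value=True),
}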