Example: Simple Validation

A proof of concept example of evaluating a simple text transformation function with deterministic checks.

Scenario

We're testing a function that converts text to title case. We want to verify:

Output is always a string
Output matches expected format
Function handles edge cases correctly
Performance meets requirements

Complete Example

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import (
    Contains,
    EqualsExpected,
    IsInstance,
    MaxDuration,
)


# The function we're testing
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


# Create evaluation dataset
dataset = Dataset(
    name='title_case_validation',
    cases=[
        # Basic functionality
        Case(
            name='basic_lowercase',
            inputs='hello world',
            expected_output='Hello World',
        ),
        Case(
            name='basic_uppercase',
            inputs='HELLO WORLD',
            expected_output='Hello World',
        ),
        Case(
            name='mixed_case',
            inputs='HeLLo WoRLd',
            expected_output='Hello World',
        ),

        # Edge cases
        Case(
            name='empty_string',
            inputs='',
            expected_output='',
        ),
        Case(
            name='single_word',
            inputs='hello',
            expected_output='Hello',
        ),
        Case(
            name='with_punctuation',
            inputs='hello, world!',
            expected_output='Hello, World!',
        ),
        Case(
            name='with_numbers',
            inputs='hello 123 world',
            expected_output='Hello 123 World',
        ),
        Case(
            name='apostrophes',
            inputs="don't stop believin'",
            expected_output="Don'T Stop Believin'",
        ),
    ],
    evaluators=[
        # Always returns a string
        IsInstance(type_name='str'),

        # Matches expected output
        EqualsExpected(),

        # Output should contain capital letters
        Contains(value='H', evaluation_name='has_capitals'),

        # Should be fast (under 1ms)
        MaxDuration(seconds=0.001),
    ],
)


# Run evaluation
if __name__ == '__main__':
    report = dataset.evaluate_sync(to_title_case)

    # Print results
    report.print(include_input=True, include_output=True)
"""
                            Evaluation Summary: to_title_case
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID          ┃ Inputs               ┃ Outputs              ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ basic_lowercase  │ hello world          │ Hello World          │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ basic_uppercase  │ HELLO WORLD          │ Hello World          │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ mixed_case       │ HeLLo WoRLd          │ Hello World          │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ empty_string     │ -                    │ -                    │ ✔✔✗✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ single_word      │ hello                │ Hello                │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ with_punctuation │ hello, world!        │ Hello, World!        │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ with_numbers     │ hello 123 world      │ Hello 123 World      │ ✔✔✔✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ apostrophes      │ don't stop believin' │ Don'T Stop Believin' │ ✔✔✗✗       │     10ms │
├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
│ Averages         │                      │                      │ 68.8% ✔    │     10ms │
└──────────────────┴──────────────────────┴──────────────────────┴────────────┴──────────┘
"""
# Check if all passed
avg = report.averages()
if avg and avg.assertions == 1.0:
    print('\n✅ All tests passed!')
else:
    print(f'\n❌ Some tests failed (pass rate: {avg.assertions:.1%})')
    """
    ❌ Some tests failed (pass rate: 68.8%)
    """

Expected Output

                        Evaluation Summary: to_title_case
┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID           ┃ Inputs               ┃ Outputs               ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ basic_lowercase   │ hello world          │ Hello World           │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ basic_uppercase   │ HELLO WORLD          │ Hello World           │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ mixed_case        │ HeLLo WoRLd          │ Hello World           │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ empty_string      │                      │                       │ ✔✔✗✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ single_word       │ hello                │ Hello                 │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ with_punctuation  │ hello, world!        │ Hello, World!         │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ with_numbers      │ hello 123 world      │ Hello 123 World       │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ apostrophes       │ don't stop believin' │ Don'T Stop Believin'  │ ✔✔✔✔       │      <1ms│
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
│ Averages          │                      │                       │ 96.9% ✔    │      <1ms│
└───────────────────┴──────────────────────┴───────────────────────┴────────────┴──────────┘

✅ All tests passed!

Note: The empty_string case has one failed assertion (has_capitals) because an empty string contains no capital letters.

Saving and Loading

Save the dataset for future use:

from typing import Any

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected


# The function we're testing
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


# Create dataset
dataset: Dataset[str, str, Any] = Dataset(
    cases=[Case(inputs='test', expected_output='Test')],
    evaluators=[EqualsExpected()],
)

# Save to YAML
dataset.to_file('title_case_tests.yaml')

# Load later
dataset = Dataset.from_file('title_case_tests.yaml')
report = dataset.evaluate_sync(to_title_case)

Adding More Cases

As you find bugs or edge cases, add them to the dataset:

from pydantic_evals import Dataset

# Load existing dataset
dataset = Dataset.from_file('title_case_tests.yaml')

# Found a bug with unicode
dataset.add_case(
    name='unicode_chars',
    inputs='café résumé',
    expected_output='Café Résumé',
)

# Found a bug with all caps words
dataset.add_case(
    name='acronyms',
    inputs='the USA and FBI',
    expected_output='The Usa And Fbi',  # Python's title() behavior
)

# Test with very long input
dataset.add_case(
    name='long_input',
    inputs=' '.join(['word'] * 1000),
    expected_output=' '.join(['Word'] * 1000),
)

# Save updated dataset
dataset.to_file('title_case_tests.yaml')

Using with pytest

Integrate with pytest for CI/CD:

import pytest

from pydantic_evals import Dataset


# The function we're testing
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


@pytest.fixture
def title_case_dataset():
    return Dataset.from_file('title_case_tests.yaml')


def test_title_case_evaluation(title_case_dataset):
    """Run evaluation tests."""
    report = title_case_dataset.evaluate_sync(to_title_case)

    # All cases should pass
    avg = report.averages()
    assert avg is not None
    assert avg.assertions == 1.0, f'Some tests failed (pass rate: {avg.assertions:.1%})'


def test_title_case_performance(title_case_dataset):
    """Verify performance."""
    report = title_case_dataset.evaluate_sync(to_title_case)

    # All cases should complete quickly
    for case in report.cases:
        assert case.task_duration < 0.001, f'{case.name} took {case.task_duration}s'

Next Steps

Built-in Evaluators - Explore all available evaluators
Custom Evaluators - Write your own evaluation logic
Dataset Management - Save, load, and manage datasets
Concurrency & Performance - Optimize evaluation performance