# Metrics & Attributes

Track custom metrics and attributes during task execution for richer evaluation insights.

## Overview

While executing evaluation tasks, you can record:

- **Metrics** - numeric values (`int`/`float`) for quantitative measurements
- **Attributes** - arbitrary data for qualitative information

Both appear in evaluation reports and are available to evaluators for assessment.
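For an end-to-end picture before diving into the details, here is a minimal sketch that records one metric and one attribute inside a task and reads them back in an evaluator. The task and evaluator names (`answer`, `UsedExpectedModel`) are illustrative, not part of the library:

```python
from dataclasses import dataclass

from pydantic_evals import Case, Dataset
from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


def answer(inputs: str) -> str:
    increment_eval_metric('calls', 1)  # numeric value: can be aggregated
    set_eval_attribute('model', 'gpt-4o')  # arbitrary data: context for evaluators
    return f'Answer: {inputs}'


@dataclass
class UsedExpectedModel(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Metrics and attributes recorded by the task are available here
        return ctx.attributes.get('model') == 'gpt-4o' and ctx.metrics.get('calls', 0) == 1


dataset = Dataset(cases=[Case(inputs='test')], evaluators=[UsedExpectedModel()])
report = dataset.evaluate_sync(answer)
```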
## Recording Metrics

Use `increment_eval_metric` to track numeric values:

```python
from dataclasses import dataclass

from pydantic_evals.dataset import increment_eval_metric


@dataclass
class APIResult:
    output: str
    usage: 'Usage'


@dataclass
class Usage:
    total_tokens: int


def call_api(inputs: str) -> APIResult:
    return APIResult(output=f'Result: {inputs}', usage=Usage(total_tokens=100))


def my_task(inputs: str) -> str:
    # Track API calls
    increment_eval_metric('api_calls', 1)

    result = call_api(inputs)

    # Track tokens used
    increment_eval_metric('tokens_used', result.usage.total_tokens)

    return result.output
```
## Recording Attributes

Use `set_eval_attribute` to store arbitrary data:

```python
from pydantic_evals.dataset import set_eval_attribute


def process(inputs: str) -> str:
    return f'Processed: {inputs}'


def my_task(inputs: str) -> str:
    # Record which model was used
    set_eval_attribute('model', 'gpt-4o')

    # Record feature flags
    set_eval_attribute('used_cache', True)
    set_eval_attribute('retry_count', 2)

    # Record structured data
    set_eval_attribute('config', {'temperature': 0.7, 'max_tokens': 100})

    return process(inputs)
```
## Accessing in Evaluators

Metrics and attributes are available on the `EvaluatorContext`:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class EfficiencyChecker(Evaluator):
    max_api_calls: int = 5

    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
        # Access metrics
        api_calls = ctx.metrics.get('api_calls', 0)
        tokens_used = ctx.metrics.get('tokens_used', 0)

        # Access attributes
        used_cache = ctx.attributes.get('used_cache', False)

        return {
            'efficient_api_usage': api_calls <= self.max_api_calls,
            'used_caching': used_cache,
            'token_efficient': tokens_used < 1000,
        }
```
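The evaluator is attached to a dataset like any other. A minimal sketch, assuming the `my_task` function from the Recording Metrics example above and an illustrative case input:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
    cases=[Case(inputs='What is the weather?')],
    evaluators=[EfficiencyChecker(max_api_calls=3)],
)
report = dataset.evaluate_sync(my_task)
report.print()
```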
## Viewing in Reports

Metrics and attributes appear in the report data for each case:

```python
from pydantic_evals import Case, Dataset


def task(inputs: str) -> str:
    return f'Result: {inputs}'


dataset = Dataset(cases=[Case(inputs='test')], evaluators=[])
report = dataset.evaluate_sync(task)

for case in report.cases:
    print(f'{case.name}:')
    #> Case 1:
    print(f'  Metrics: {case.metrics}')
    #> Metrics: {}
    print(f'  Attributes: {case.attributes}')
    #> Attributes: {}
```
Metrics and attributes are not shown in the default printed report; access them programmatically as above, or view them in Logfire.
## Automatic Metrics

When using Pydantic AI and Logfire, some metrics are automatically tracked:

```python
import logfire
from pydantic_ai import Agent

logfire.configure(send_to_logfire='if-token-present')

agent = Agent('openai:gpt-4o')


async def ai_task(inputs: str) -> str:
    result = await agent.run(inputs)
    return result.output


# Automatically tracked metrics:
# - requests: Number of LLM calls
# - input_tokens: Total input tokens
# - output_tokens: Total output tokens
# - prompt_tokens: Prompt tokens (if available)
# - completion_tokens: Completion tokens (if available)
# - cost: Estimated cost (if using genai-prices)
```
Access these in evaluators just like custom metrics:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class CostChecker(Evaluator):
    max_cost: float = 0.01  # $0.01

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        cost = ctx.metrics.get('cost', 0.0)
        return cost <= self.max_cost
```
## Practical Examples

### API Usage Tracking

```python
from dataclasses import dataclass

from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


def check_cache(inputs: str) -> str | None:
    return None  # No cache hit for demo


@dataclass
class APIResult:
    text: str
    usage: 'Usage'


@dataclass
class Usage:
    total_tokens: int


async def call_api(inputs: str) -> APIResult:
    return APIResult(text=f'Result: {inputs}', usage=Usage(total_tokens=100))


def save_to_cache(inputs: str, result: str) -> None:
    pass  # Save to cache


async def smart_task(inputs: str) -> str:
    # Try cache first
    if cached := check_cache(inputs):
        set_eval_attribute('cache_hit', True)
        return cached

    set_eval_attribute('cache_hit', False)

    # Call API
    increment_eval_metric('api_calls', 1)
    result = await call_api(inputs)
    increment_eval_metric('tokens', result.usage.total_tokens)

    # Cache result
    save_to_cache(inputs, result.text)

    return result.text


# Evaluate efficiency
@dataclass
class EfficiencyEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool | float]:
        api_calls = ctx.metrics.get('api_calls', 0)
        cache_hit = ctx.attributes.get('cache_hit', False)

        return {
            'used_cache': cache_hit,
            'made_api_call': api_calls > 0,
            'efficiency_score': 1.0 if cache_hit else 0.5,
        }
```
### Tool Usage Tracking

```python
from dataclasses import dataclass

from pydantic_ai import Agent, RunContext

from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext

agent = Agent('openai:gpt-4o')


def search(query: str) -> str:
    return f'Search results for: {query}'


def call(endpoint: str) -> str:
    return f'API response from: {endpoint}'


@agent.tool
def search_database(ctx: RunContext, query: str) -> str:
    increment_eval_metric('db_searches', 1)
    set_eval_attribute('last_query', query)
    return search(query)


@agent.tool
def call_api(ctx: RunContext, endpoint: str) -> str:
    increment_eval_metric('api_calls', 1)
    set_eval_attribute('last_endpoint', endpoint)
    return call(endpoint)


# Evaluate tool usage
@dataclass
class ToolUsageEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool | int]:
        db_searches = ctx.metrics.get('db_searches', 0)
        api_calls = ctx.metrics.get('api_calls', 0)

        return {
            'used_database': db_searches > 0,
            'used_api': api_calls > 0,
            'tool_call_count': db_searches + api_calls,
            'reasonable_tool_usage': (db_searches + api_calls) <= 5,
        }
```
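To exercise these tools during an evaluation, the task only needs to run the agent; the tool calls then record the metrics and attributes. A sketch continuing the block above, where `ai_task` and the case input are illustrative:

```python
from pydantic_evals import Case, Dataset


async def ai_task(inputs: str) -> str:
    # Running the agent invokes the tools above, which record the metrics and attributes
    result = await agent.run(inputs)
    return result.output


dataset = Dataset(
    cases=[Case(inputs='Find recent orders for customer 42')],
    evaluators=[ToolUsageEvaluator()],
)
report = dataset.evaluate_sync(ai_task)
```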
### Performance Tracking

```python
import time
from dataclasses import dataclass

from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


async def retrieve_context(inputs: str) -> list[str]:
    return ['context1', 'context2']


async def generate_response(context: list[str], inputs: str) -> str:
    return f'Generated response for {inputs}'


async def monitored_task(inputs: str) -> str:
    # Track sub-operation timing
    t0 = time.perf_counter()
    context = await retrieve_context(inputs)
    retrieve_time = time.perf_counter() - t0
    increment_eval_metric('retrieve_time', retrieve_time)

    t0 = time.perf_counter()
    result = await generate_response(context, inputs)
    generate_time = time.perf_counter() - t0
    increment_eval_metric('generate_time', generate_time)

    # Record which operations were needed
    set_eval_attribute('needed_retrieval', len(context) > 0)
    set_eval_attribute('context_chunks', len(context))

    return result


# Evaluate performance
@dataclass
class PerformanceEvaluator(Evaluator):
    max_retrieve_time: float = 0.5
    max_generate_time: float = 2.0

    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
        retrieve_time = ctx.metrics.get('retrieve_time', 0.0)
        generate_time = ctx.metrics.get('generate_time', 0.0)

        return {
            'fast_retrieval': retrieve_time <= self.max_retrieve_time,
            'fast_generation': generate_time <= self.max_generate_time,
        }
```
### Quality Tracking

```python
from dataclasses import dataclass

from pydantic_evals.dataset import set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


async def llm_call(inputs: str) -> dict:
    return {'text': f'Response: {inputs}', 'confidence': 0.85, 'sources': ['doc1', 'doc2']}


async def quality_task(inputs: str) -> str:
    result = await llm_call(inputs)

    # Extract quality indicators
    confidence = result.get('confidence', 0.0)
    sources_used = result.get('sources', [])

    set_eval_attribute('confidence', confidence)
    set_eval_attribute('source_count', len(sources_used))
    set_eval_attribute('sources', sources_used)

    return result['text']


# Evaluate based on quality signals
@dataclass
class QualityEvaluator(Evaluator):
    min_confidence: float = 0.7

    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool | float]:
        confidence = ctx.attributes.get('confidence', 0.0)
        source_count = ctx.attributes.get('source_count', 0)

        return {
            'high_confidence': confidence >= self.min_confidence,
            'used_sources': source_count > 0,
            'quality_score': confidence * (1.0 + 0.1 * source_count),
        }
```
## Metrics vs Attributes vs Metadata

Understanding the differences:
| Feature | Metrics | Attributes | Metadata |
|---|---|---|---|
| Set in | Task execution | Task execution | Case definition |
| Type | int, float | Any | Any |
| Purpose | Quantitative | Qualitative | Test data |
| Used for | Aggregation | Context | Input to task |
| Available to | Evaluators | Evaluators | Task & Evaluators |
```python
from pydantic_evals import Case
from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute

# Metadata: defined on the case, before execution
case = Case(
    inputs='question',
    metadata={'difficulty': 'hard', 'category': 'math'},
)


# Metrics & attributes: recorded during execution
def task(inputs: str) -> str:
    increment_eval_metric('tokens', 100)
    set_eval_attribute('model', 'gpt-4o')
    return f'Result: {inputs}'
```
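All three then surface on the `EvaluatorContext`: metadata on `ctx.metadata`, and the values recorded during execution on `ctx.metrics` and `ctx.attributes`. A sketch of an evaluator that combines them (the name and thresholds are illustrative):

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class DifficultyAwareEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        difficulty = (ctx.metadata or {}).get('difficulty')  # from the Case definition
        tokens = ctx.metrics.get('tokens', 0)  # recorded during execution
        model = ctx.attributes.get('model')  # recorded during execution

        # Allow a larger token budget for hard cases (budgets are illustrative)
        budget = 500 if difficulty == 'hard' else 200
        return tokens <= budget and model is not None
```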
## Troubleshooting

### "Metrics/attributes not appearing"

Ensure you're calling the functions inside the task:

```python
from pydantic_evals.dataset import increment_eval_metric


def process(inputs: str) -> str:
    return f'Processed: {inputs}'


# Bad: called outside the task
increment_eval_metric('count', 1)


def bad_task(inputs: str) -> str:
    return process(inputs)


# Good: called inside the task
def good_task(inputs: str) -> str:
    increment_eval_metric('count', 1)
    return process(inputs)
```
"Metrics not incrementing"
Check you're using increment_eval_metric, not set_eval_attribute:
from pydantic_evals import increment_eval_metric, set_eval_attribute
# Bad: This will overwrite, not increment
set_eval_attribute('count', 1)
set_eval_attribute('count', 1) # Still 1
# Good: This increments
increment_eval_metric('count', 1)
increment_eval_metric('count', 1) # Now 2
"Too much data in attributes"
Store summaries, not raw data:
from pydantic_evals import set_eval_attribute
giant_response_object = {'key' + str(i): 'value' * 100 for i in range(1000)}
# Bad: Huge object
set_eval_attribute('full_response', giant_response_object)
# Good: Summary
set_eval_attribute('response_size_kb', len(str(giant_response_object)) / 1024)
set_eval_attribute('response_keys', list(giant_response_object.keys())[:10]) # First 10 keys
## Next Steps
- Custom Evaluators - Use metrics/attributes in evaluators
- Logfire Integration - View metrics in Logfire
- Concurrency & Performance - Optimize evaluation performance