What AI Observability Adds Beyond Standard Spring Metrics
Standard Spring Boot Actuator gives you JVM, HTTP, and database metrics. AI services need three additional dimensions:
| Dimension | What to Measure | Why It Matters |
|---|---|---|
| Token economics | Input/output tokens per call, per feature, per user | LLM cost is proportional to tokens, not requests |
| AI latency | Time-to-first-token (TTFT), total completion time | TTFT is what users feel; total affects throughput |
| Provider health | Error rate by provider, circuit breaker state | Detect outages before users do; trigger failover |
Actuator + Micrometer Setup
<!-- pom.xml -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-cloudwatch2</artifactId>
</dependency>
<dependency>
<!-- For Grafana / Prometheus -->
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
# application.yml
management:
  endpoints:
    web:
      exposure:
        include: health,metrics,prometheus,info
  metrics:
    tags:
      application: ai-service
      environment: ${ENVIRONMENT:local}
    distribution:
      percentiles-histogram:
        ai.completion.duration: true # enable histograms for p95/p99
        ai.ttft.duration: true
      percentiles:
        "[ai.completion.duration]": 0.5,0.9,0.95,0.99
Core AI Metrics: The AiMetricsRecorder
@Component
public class AiMetricsRecorder {
private final MeterRegistry registry;
// Counters
private final Counter totalRequests;
private final Counter totalErrors;
// Histograms / Timers
private final Timer completionTimer;
private final Timer ttftTimer;
// Distribution summaries for tokens
private final DistributionSummary inputTokens;
private final DistributionSummary outputTokens;
public AiMetricsRecorder(MeterRegistry registry) {
this.registry = registry;
totalRequests = Counter.builder("ai.requests.total")
.description("Total AI completion requests")
.register(registry);
totalErrors = Counter.builder("ai.errors.total")
.description("Total AI completion errors")
.register(registry);
completionTimer = Timer.builder("ai.completion.duration")
.description("End-to-end AI completion latency")
.publishPercentiles(0.5, 0.9, 0.95, 0.99)
.register(registry);
ttftTimer = Timer.builder("ai.ttft.duration")
.description("Time to first token")
.publishPercentiles(0.5, 0.9, 0.95)
.register(registry);
inputTokens = DistributionSummary.builder("ai.tokens.input")
.description("Input tokens per completion")
.register(registry);
outputTokens = DistributionSummary.builder("ai.tokens.output")
.description("Output tokens per completion")
.register(registry);
}
public void recordCompletion(AiCallResult result) {
Tags tags = Tags.of(
"provider", result.provider(),
"model", result.model(),
"feature", result.feature(),
"status", result.success() ? "success" : "error"
);
totalRequests.increment();
if (!result.success()) totalErrors.increment();
registry.timer("ai.completion.duration", tags)
.record(result.totalDurationMs(), TimeUnit.MILLISECONDS);
if (result.ttftMs() > 0) {
registry.timer("ai.ttft.duration", tags)
.record(result.ttftMs(), TimeUnit.MILLISECONDS);
}
registry.summary("ai.tokens.input", tags).record(result.inputTokens());
registry.summary("ai.tokens.output", tags).record(result.outputTokens());
// Record estimated cost as a gauge updated per call
double costUsd = calculateCost(result);
registry.counter("ai.cost.usd", tags).increment(costUsd);
}
private double calculateCost(AiCallResult result) {
return switch (result.model()) {
case "claude-sonnet-4-6" ->
result.inputTokens() / 1_000_000.0 * 3.00 // $3/MTok input
+ result.outputTokens() / 1_000_000.0 * 15.00; // $15/MTok output
case "gpt-4o" ->
result.inputTokens() / 1_000_000.0 * 2.50
+ result.outputTokens() / 1_000_000.0 * 10.00;
default -> 0.0;
};
}
}
/**
 * Outcome of a single AI completion call, as handed to {@code AiMetricsRecorder.recordCompletion}.
 *
 * @param provider        value recorded as the {@code provider} tag
 * @param model           value recorded as the {@code model} tag; also selects the price table
 *                        used for cost estimation
 * @param feature         value recorded as the {@code feature} tag
 * @param success         whether the call succeeded; drives the {@code status} tag and the
 *                        error counter
 * @param totalDurationMs end-to-end duration of the call, in milliseconds
 * @param ttftMs          time to first token, in milliseconds; only recorded when greater than zero
 * @param inputTokens     number of input tokens for the call
 * @param outputTokens    number of output tokens for the call
 */
record AiCallResult(
        String provider,
        String model,
        String feature,
        boolean success,
        long totalDurationMs,
        long ttftMs,
        int inputTokens,
        int outputTokens) {
}
Wrapping ChatClient Calls with Metrics
/**
 * Wraps blocking {@code ChatClient} calls so that every completion, successful or not, is
 * reported to {@link AiMetricsRecorder}.
 */
@Service
public class InstrumentedAiService {

    private static final String PROVIDER = "anthropic";
    private static final String MODEL = "claude-sonnet-4-6";

    private final ChatClient chatClient;
    private final AiMetricsRecorder metrics;

    /**
     * @param chatClient the client used to call the model
     * @param metrics    the recorder that receives one result per call
     */
    public InstrumentedAiService(ChatClient chatClient, AiMetricsRecorder metrics) {
        this.chatClient = chatClient;
        this.metrics = metrics;
    }

    /**
     * Sends {@code prompt} as a user message and returns the model's text output.
     *
     * <p>Metrics are recorded in a {@code finally} block, so failed calls are counted too (with
     * zero tokens when no response was received). Time-to-first-token is reported as 0 because
     * this is a blocking, non-streaming call.
     *
     * @param feature the product feature issuing the call; recorded as a metric tag
     * @param prompt  the user prompt text
     * @return the content of the first result in the response
     */
    public String complete(String feature, String prompt) {
        // nanoTime is monotonic; currentTimeMillis can jump when the wall clock is adjusted.
        long startNanos = System.nanoTime();
        // Flipped to true only once the content has been extracted, so any Throwable
        // (not just Exception) is reported as an error.
        boolean success = false;
        ChatResponse response = null;
        try {
            response = chatClient.prompt()
                    .user(prompt)
                    .call()
                    .chatResponse();
            String content = response.getResult().getOutput().getContent();
            success = true;
            return content;
        } finally {
            long durationMs = (System.nanoTime() - startNanos) / 1_000_000L;
            Usage usage = usageOf(response);
            metrics.recordCompletion(new AiCallResult(
                    PROVIDER, MODEL, feature,
                    success, durationMs, 0L,
                    usage == null ? 0 : tokenCount(usage.getPromptTokens()),
                    usage == null ? 0 : tokenCount(usage.getGenerationTokens())
            ));
        }
    }

    /** Returns the usage attached to {@code response}, or {@code null} when none is available. */
    private static Usage usageOf(ChatResponse response) {
        if (response == null || response.getMetadata() == null) {
            return null;
        }
        return response.getMetadata().getUsage();
    }

    /**
     * Converts a token count to {@code int}, treating {@code null} as zero. Accepting
     * {@link Number} works whether the usage API reports counts as {@code Long} or
     * {@code Integer}.
     */
    private static int tokenCount(Number count) {
        return count == null ? 0 : count.intValue();
    }
}
Cost Tracking Dashboard: Key Queries
These PromQL queries drive the Grafana AI cost dashboard:
# Total AI cost per hour by feature
sum by (feature) (increase(ai_cost_usd_total[1h]))
# p95 completion latency by model (Micrometer timers are exported with a _seconds suffix)
histogram_quantile(0.95,
  sum by (le, model) (rate(ai_completion_duration_seconds_bucket[5m]))
)
# Error rate by provider (rolling 5m)
sum by (provider) (rate(ai_errors_total[5m]))
  / sum by (provider) (rate(ai_requests_total[5m]))
# Average input tokens per call by feature (detect prompt bloat)
sum by (feature) (rate(ai_tokens_input_sum[5m]))
  / sum by (feature) (rate(ai_tokens_input_count[5m]))
# Circuit breaker state: one 0/1 series per `state` label; 1 = breaker is currently in that state
resilience4j_circuitbreaker_state{name="anthropic", state="open"}
CloudWatch Custom Metrics (AWS)
# application.yml — CloudWatch Micrometer export
management:
  cloudwatch:
    metrics:
      export:
        enabled: true
        namespace: AiService/${ENVIRONMENT}
        batch-size: 20
        step: 1m
// CloudWatch alarm — alert if hourly AI cost exceeds $10
/**
 * Declares an alarm on the summed {@code ai.cost.usd} metric over a one-hour period, firing
 * when the sum is greater than or equal to the hourly budget and notifying {@code snsTopicArn}.
 *
 * <p>NOTE(review): the {@code cfn} parameter is not used in this method.
 * NOTE(review): the namespace is hard-coded to {@code AiService/production}, while the exporter
 * configuration builds it from {@code AiService/${ENVIRONMENT}} — confirm this is intended.
 * NOTE(review): {@code snsTopicArn} is not declared in this snippet; presumably a field of the
 * enclosing configuration class — verify.
 * NOTE(review): confirm the metric name and dimensions actually published by the CloudWatch
 * registry match what this alarm selects.
 */
@Bean
public CfnAlarm aiCostAlarm(CloudFormationClient cfn) {
    final double hourlyBudgetUsd = 10.0; // $10/hour threshold
    final var evaluationWindow = Duration.hours(1);

    return CfnAlarm.Builder.create("AiCostHourlyAlarm")
            .alarmName("AIService-HourlyCostExceeded")
            .namespace("AiService/production")
            .metricName("ai.cost.usd")
            .statistic("Sum")
            .period(evaluationWindow)
            .evaluationPeriods(1)
            .threshold(hourlyBudgetUsd)
            .comparisonOperator(ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD)
            .alarmActions(List.of(snsTopicArn))
            .build();
}
Health Indicators for AI Providers
/**
 * Health indicator that maps the state of the {@code anthropic} circuit breaker to a Spring Boot
 * {@link Health} status: CLOSED is up, OPEN is down, and every other state is unknown.
 */
@Component
public class AnthropicHealthIndicator implements HealthIndicator {

    private static final String BREAKER_NAME = "anthropic";

    private final CircuitBreakerRegistry circuitBreakerRegistry;
    // Not read by health(); kept so the class's declared dependencies stay unchanged.
    private final AiMetricsRecorder metrics;

    /**
     * @param circuitBreakerRegistry registry the {@code anthropic} breaker is looked up in
     * @param metrics                the AI metrics recorder (currently unused by this indicator)
     */
    public AnthropicHealthIndicator(CircuitBreakerRegistry circuitBreakerRegistry,
                                    AiMetricsRecorder metrics) {
        this.circuitBreakerRegistry = circuitBreakerRegistry;
        this.metrics = metrics;
    }

    /**
     * Reports health from the current circuit breaker state.
     *
     * @return up with the failure rate when CLOSED, unknown when HALF_OPEN, down with the
     *         configured open-state wait when OPEN, and unknown for any other state
     */
    @Override
    public Health health() {
        CircuitBreaker cb = circuitBreakerRegistry.circuitBreaker(BREAKER_NAME);
        return switch (cb.getState()) {
            case CLOSED -> Health.up()
                    .withDetail("circuitBreaker", "CLOSED")
                    .withDetail("failureRate", formatFailureRate(cb.getMetrics().getFailureRate()))
                    .build();
            case HALF_OPEN -> Health.unknown()
                    .withDetail("circuitBreaker", "HALF_OPEN — testing recovery")
                    .build();
            case OPEN -> Health.down()
                    .withDetail("circuitBreaker", "OPEN — using fallback provider")
                    // Report a wait time rather than the state itself, which is always OPEN here.
                    .withDetail("retryAfter", openStateWaitMillis(cb) + "ms")
                    .build();
            default -> Health.unknown().build();
        };
    }

    /**
     * Formats the failure rate as a percentage. A negative rate is reported as {@code n/a}:
     * Resilience4j returns -1 until enough calls have been recorded to compute a rate.
     */
    private static String formatFailureRate(float failureRate) {
        return failureRate < 0 ? "n/a" : failureRate + "%";
    }

    /**
     * Returns the configured wait, in milliseconds, before an open breaker starts probing again.
     * This is the configured interval for the first open period, not the time remaining.
     */
    private static long openStateWaitMillis(CircuitBreaker cb) {
        return cb.getCircuitBreakerConfig().getWaitIntervalFunctionInOpenState().apply(1);
    }
}
- Cost overview — hourly/daily spend by model, feature, and user tier; vs. budget baseline
- Latency — p50/p95/p99 completion time and TTFT, trended over 24h; by model and provider
- Token usage — average input/output tokens per call by feature; detect prompt bloat early
- Error & circuit breaker — error rate by provider, circuit breaker state transitions, fallback frequency
- User quotas — daily token usage distribution; users approaching limits; top consumers