// Source: vojo/apps/ai-bot/telemetry.go (127 lines, 4.4 KiB, Go)

package main
import (
"context"
"time"
)
// telemetry.go is the request_log analytics path: it captures route, cost, latency
// and outcome for each engaged request so that the real $/day and route mix can be
// measured rather than modelled (the build plan's "is the cascade worth it"
// question). It is strictly off the answer path: gated by TELEMETRY_ENABLED, written
// in a recovered goroutine, and a write failure only logs a WARN. A request never
// fails to be answered because telemetry could not be recorded.

// Route identifiers, stored verbatim as the request_log.route value.
//
// grok_direct is the path in use today; the remaining routes land behind flags in
// later phases. "none" records that no model ran (a skip or a limiter denial).
const (
	routeNone        = "none"
	routeGrokDirect  = "grok_direct"
	routeTrivial     = "trivial_direct"
	routeWebThenGrok = "web_then_grok"
	routeReason      = "reason_then_grok"
)
// Reason tokens written to request_log.degraded when a request is skipped or
// degraded. The strings are kept stable so analytics queries can GROUP BY them.
const (
	// Presumably the reasons handed to recordSkip, whose comment lists
	// encrypted/media/foreign as the pre-model skips — confirm against callers.
	degradeEncrypted = "encrypted_room"
	degradeMedia     = "media"
	degradeForeign   = "foreign_room"

	// Remaining reasons; their call sites are outside this file's telemetry code.
	degradeEmpty      = "empty_completion"
	degradeSendFailed = "send_failed"
	degradeReserveErr = "reserve_error"
	degradeRouter     = "router_failed"
	degradeWeb        = "web_failed"
	degradeTrivial    = "trivial_failed"
	degradeGroundCap  = "grounding_cap"
	degradeReasoning  = "reasoning_failed"
)
// telemetryTrimEvery is the write interval between retention trims:
// maybeTrimTelemetry only calls TrimRequestLog when its write counter reaches a
// multiple of this value. That keeps the analytics table time-bounded without a
// separate lifecycle or a DELETE on every insert.
const telemetryTrimEvery = 200
// RequestLog is a single analytics row; its fields mirror the request_log columns.
// A zero value means "did not apply" — for example, a grok_direct request leaves the
// cascade fields at zero.
type RequestLog struct {
	// Identity of the request (recordSkip fills these from the triggering Event).
	ID     string
	RoomID string
	Sender string

	// Routing decision.
	Route            string
	RouterSource     string // heuristic|classifier|default|forced|degraded
	RouterConfidence float64
	Models           map[string]string // {"router":"…","final":"…"}

	// Token counts and cost.
	PromptTokens     int
	CachedTokens     int
	CompletionTokens int
	Cost             CostBreakdown

	// Timing, overall and per stage.
	LatencyMS int
	StageMS   map[string]int // {"router":12,"web":1400,"final":2100}

	// Boolean markers for the request.
	Escalated     bool
	FallbackFired bool
	CacheHit      bool
	CeilingHit    bool
	PerUserCapHit bool

	// Provenance and outcome. Degraded carries one of the degrade* reason tokens.
	PromptVersion     string
	ProviderRequestID string
	Degraded          string
	Err               string
	OK                bool

	QueryText string // stored only when TELEMETRY_STORE_TEXT; stripped otherwise
}
// recordTelemetry stores one request_log row without involving the answer path.
//
// It returns immediately when b.cfg.TelemetryEnabled is false. When
// b.cfg.TelemetryStoreText is false the row's QueryText is cleared first, so message
// content is not written to the analytics table by default. The insert and the
// follow-up retention check are handed to b.safego under the name "telemetry"; an
// insert error is only logged at WARN and is never returned to the caller.
//
// NOTE(review): safego is defined elsewhere — presumably it runs the func in a
// panic-recovering goroutine so this can never drop or delay the reply; confirm there.
func (b *Bot) recordTelemetry(ctx context.Context, rl RequestLog) {
	if enabled := b.cfg.TelemetryEnabled; !enabled {
		return
	}

	// rl was passed by value, so blanking the text here leaves the caller's copy alone.
	if keepText := b.cfg.TelemetryStoreText; !keepText {
		rl.QueryText = ""
	}

	persist := func() {
		err := b.st.InsertRequestLog(rl)
		if err != nil {
			b.log.WarnContext(ctx, "request_log insert failed (non-fatal)", "id", rl.ID, "err", err)
		}
		// The retention check runs after every attempt, whether or not the insert succeeded.
		b.maybeTrimTelemetry(ctx)
	}
	b.safego(ctx, "telemetry", persist)
}
// recordSkip writes a route=none row for a request that was addressed to the bot but
// could not be fully served before any model ran (encrypted/media/foreign). Such
// skips are low-frequency, so logging each one keeps the "why no answer" visible
// without flooding the table with the common not-addressed drops, which are not
// logged (pre-claim best-effort).
//
// ev supplies the event ID, room and sender; reason becomes the row's Degraded
// token. The row goes through recordTelemetry, so its gating applies here too.
func (b *Bot) recordSkip(ctx context.Context, ev *Event, reason string) {
	// OK is left at its zero value (false): a skipped request was not answered.
	var row RequestLog
	row.ID, row.RoomID, row.Sender = ev.EventID, ev.RoomID, ev.Sender
	row.Route = routeNone
	row.RouterSource = "default"
	row.PromptVersion = b.promptVersion
	row.Degraded = reason

	b.recordTelemetry(ctx, row)
}
// maybeTrimTelemetry applies the time-based retention trim on every
// telemetryTrimEvery-th call. It is best-effort: a trim failure is only logged at
// WARN. recordTelemetry calls it from inside the func it hands to b.safego.
//
// A non-positive b.cfg.TelemetryRetention disables trimming entirely; in that case
// the write counter is not advanced either.
func (b *Bot) maybeTrimTelemetry(ctx context.Context) {
	retention := b.cfg.TelemetryRetention
	if retention <= 0 {
		return
	}

	// Count this write; only every telemetryTrimEvery-th one goes on to trim.
	if writes := b.telemetryWrites.Add(1); writes%telemetryTrimEvery != 0 {
		return
	}

	// Rows older than now minus the retention window are passed to the store for removal.
	cutoff := time.Now().Add(-retention)
	err := b.st.TrimRequestLog(cutoff)
	if err == nil {
		return
	}
	b.log.WarnContext(ctx, "request_log trim failed (non-fatal)", "err", err)
}