127 lines
4.4 KiB
Go
127 lines
4.4 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
)
|
|
|
|
// telemetry.go is the request_log analytics path: it captures route, cost, latency
// and outcome for each engaged request so the real $/day and route mix can be
// MEASURED (the build plan's whole "is the cascade worth it" question) instead of
// modelled. It is strictly off the answer path — gated by TELEMETRY_ENABLED, written
// in a recovered goroutine, and a write failure only logs a WARN. A request never
// fails to be answered because telemetry couldn't be recorded.
|
|
|
|
// Route names. These strings are also the values stored in request_log.route, so
// they must stay stable once rows have been written.
//
// grok_direct is today's path; the remaining model routes land behind flags in
// later phases. "none" means no model ran (a skip or a limiter denial).
const (
	routeNone        = "none"
	routeGrokDirect  = "grok_direct"
	routeTrivial     = "trivial_direct"
	routeWebThenGrok = "web_then_grok"
	routeReason      = "reason_then_grok"
)
|
|
|
|
// Degrade/skip reason strings, stored in request_log.degraded. They are stable
// tokens so the analytics can GROUP BY them; renaming one splits its history
// across two groups.
const (
	degradeEncrypted  = "encrypted_room"
	degradeMedia      = "media"
	degradeForeign    = "foreign_room"
	degradeEmpty      = "empty_completion"
	degradeSendFailed = "send_failed"
	degradeReserveErr = "reserve_error"
	degradeRouter     = "router_failed"
	degradeWeb        = "web_failed"
	degradeTrivial    = "trivial_failed"
	degradeGroundCap  = "grounding_cap"
	degradeReasoning  = "reasoning_failed"
)
|
|
|
|
// telemetryTrimEvery bounds how often the retention trim runs: once per N passes
// through the trim check rather than on every insert. That keeps the analytics
// table time-bounded without a separate lifecycle or a DELETE on every insert,
// and keeps the trim off the hot path.
const telemetryTrimEvery = 200
|
|
|
|
// RequestLog is one analytics row (the request_log columns). Zero values are the
|
|
// "didn't apply" case — a grok_direct request leaves the cascade fields zero.
|
|
type RequestLog struct {
|
|
ID string
|
|
RoomID string
|
|
Sender string
|
|
Route string
|
|
RouterSource string // heuristic|classifier|default|forced|degraded
|
|
RouterConfidence float64
|
|
Models map[string]string // {"router":"…","final":"…"}
|
|
|
|
PromptTokens int
|
|
CachedTokens int
|
|
CompletionTokens int
|
|
Cost CostBreakdown
|
|
|
|
LatencyMS int
|
|
StageMS map[string]int // {"router":12,"web":1400,"final":2100}
|
|
|
|
Escalated bool
|
|
FallbackFired bool
|
|
CacheHit bool
|
|
CeilingHit bool
|
|
PerUserCapHit bool
|
|
PromptVersion string
|
|
ProviderRequestID string
|
|
Degraded string
|
|
Err string
|
|
OK bool
|
|
QueryText string // stored only when TELEMETRY_STORE_TEXT; stripped otherwise
|
|
}
|
|
|
|
// recordTelemetry persists a row off the answer path. No-op unless TELEMETRY_ENABLED.
|
|
// The query text is stripped unless TELEMETRY_STORE_TEXT, so message content never
|
|
// lands in the analytics table by default. Runs in a recovered goroutine and only
|
|
// logs failures, so it can never drop or delay the reply.
|
|
func (b *Bot) recordTelemetry(ctx context.Context, rl RequestLog) {
|
|
if !b.cfg.TelemetryEnabled {
|
|
return
|
|
}
|
|
if !b.cfg.TelemetryStoreText {
|
|
rl.QueryText = ""
|
|
}
|
|
b.safego(ctx, "telemetry", func() {
|
|
if err := b.st.InsertRequestLog(rl); err != nil {
|
|
b.log.WarnContext(ctx, "request_log insert failed (non-fatal)", "id", rl.ID, "err", err)
|
|
}
|
|
b.maybeTrimTelemetry(ctx)
|
|
})
|
|
}
|
|
|
|
// recordSkip logs a request the bot was addressed by but couldn't fully serve before
|
|
// any model ran (encrypted/media/foreign). These are low-frequency, so a direct row
|
|
// (route=none + reason) keeps the "why no answer" visible without flooding the table
|
|
// with the common not-addressed drops, which are not logged (pre-claim best-effort).
|
|
func (b *Bot) recordSkip(ctx context.Context, ev *Event, reason string) {
|
|
b.recordTelemetry(ctx, RequestLog{
|
|
ID: ev.EventID,
|
|
RoomID: ev.RoomID,
|
|
Sender: ev.Sender,
|
|
Route: routeNone,
|
|
RouterSource: "default",
|
|
PromptVersion: b.promptVersion,
|
|
Degraded: reason,
|
|
OK: false,
|
|
})
|
|
}
|
|
|
|
// maybeTrimTelemetry runs the time-based retention trim once per telemetryTrimEvery
|
|
// writes. Best-effort and off the hot path (called from the telemetry goroutine).
|
|
func (b *Bot) maybeTrimTelemetry(ctx context.Context) {
|
|
if b.cfg.TelemetryRetention <= 0 {
|
|
return
|
|
}
|
|
if b.telemetryWrites.Add(1)%telemetryTrimEvery != 0 {
|
|
return
|
|
}
|
|
if err := b.st.TrimRequestLog(time.Now().Add(-b.cfg.TelemetryRetention)); err != nil {
|
|
b.log.WarnContext(ctx, "request_log trim failed (non-fatal)", "err", err)
|
|
}
|
|
}
|