Files
livekit/pkg/telemetry/telemetryservice.go
T
Raja Subramanian baf47db834 Publish data and signal bytes once every 30 seconds. (#3212)
For applications with heavy data usage, accumulating data bytes over 5
minutes and then calculating rate using a much shorter window (like 2 -
5 seconds) makes it looks like there is a massive rate spike.

While this change is not a fix, this should soften the impact.

Need a better way to handle different parts of the system operating at
different frequencies. Can use rate in the reporting window, but that
will miss the spikes. Maybe that is okay. For example, if the reporting
window is 5 minutes and there was a 100 Mbps spike for about 10 seconds
of it, it would get smoothed out.
2024-11-28 09:21:44 +05:30

239 lines
8.3 KiB
Go

// Copyright 2023 LiveKit, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package telemetry
import (
"context"
"sync"
"time"
"github.com/livekit/livekit-server/pkg/utils"
"github.com/livekit/protocol/livekit"
"github.com/livekit/protocol/logger"
"github.com/livekit/protocol/webhook"
)
//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 -generate
//counterfeiter:generate . TelemetryService
type TelemetryService interface {
// TrackStats is called periodically for each track in both directions (published/subscribed)
TrackStats(key StatsKey, stat *livekit.AnalyticsStat)
// events
RoomStarted(ctx context.Context, room *livekit.Room)
RoomEnded(ctx context.Context, room *livekit.Room)
// ParticipantJoined - a participant establishes signal connection to a room
ParticipantJoined(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, clientInfo *livekit.ClientInfo, clientMeta *livekit.AnalyticsClientMeta, shouldSendEvent bool)
// ParticipantActive - a participant establishes media connection
ParticipantActive(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, clientMeta *livekit.AnalyticsClientMeta, isMigration bool)
// ParticipantResumed - there has been an ICE restart or connection resume attempt, and we've received their signal connection
ParticipantResumed(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, nodeID livekit.NodeID, reason livekit.ReconnectReason)
// ParticipantLeft - the participant leaves the room, only sent if ParticipantActive has been called before
ParticipantLeft(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, shouldSendEvent bool)
// TrackPublishRequested - a publication attempt has been received
TrackPublishRequested(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo)
// TrackPublished - a publication attempt has been successful
TrackPublished(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo)
// TrackUnpublished - a participant unpublished a track
TrackUnpublished(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo, shouldSendEvent bool)
// TrackSubscribeRequested - a participant requested to subscribe to a track
TrackSubscribeRequested(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackSubscribed - a participant subscribed to a track successfully
TrackSubscribed(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, publisher *livekit.ParticipantInfo, shouldSendEvent bool)
// TrackUnsubscribed - a participant unsubscribed from a track successfully
TrackUnsubscribed(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, shouldSendEvent bool)
// TrackSubscribeFailed - failure to subscribe to a track
TrackSubscribeFailed(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, err error, isUserError bool)
// TrackMuted - the publisher has muted the Track
TrackMuted(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackUnmuted - the publisher has muted the Track
TrackUnmuted(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackPublishedUpdate - track metadata has been updated
TrackPublishedUpdate(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackMaxSubscribedVideoQuality - publisher is notified of the max quality subscribers desire
TrackMaxSubscribedVideoQuality(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, mime string, maxQuality livekit.VideoQuality)
TrackPublishRTPStats(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, mimeType string, layer int, stats *livekit.RTPStats)
TrackSubscribeRTPStats(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, mimeType string, stats *livekit.RTPStats)
EgressStarted(ctx context.Context, info *livekit.EgressInfo)
EgressUpdated(ctx context.Context, info *livekit.EgressInfo)
EgressEnded(ctx context.Context, info *livekit.EgressInfo)
IngressCreated(ctx context.Context, info *livekit.IngressInfo)
IngressDeleted(ctx context.Context, info *livekit.IngressInfo)
IngressStarted(ctx context.Context, info *livekit.IngressInfo)
IngressUpdated(ctx context.Context, info *livekit.IngressInfo)
IngressEnded(ctx context.Context, info *livekit.IngressInfo)
LocalRoomState(ctx context.Context, info *livekit.AnalyticsNodeRooms)
// helpers
AnalyticsService
NotifyEvent(ctx context.Context, event *livekit.WebhookEvent)
FlushStats()
}
const (
workerCleanupWait = 3 * time.Minute
jobsQueueMinSize = 2048
telemetryStatsUpdateInterval = time.Second * 30
telemetryNonMediaStatsUpdateInterval = time.Second * 30
)
type telemetryService struct {
AnalyticsService
notifier webhook.QueuedNotifier
jobsQueue *utils.OpsQueue
workersMu sync.RWMutex
workers map[livekit.ParticipantID]*StatsWorker
workerList *StatsWorker
flushMu sync.Mutex
}
func NewTelemetryService(notifier webhook.QueuedNotifier, analytics AnalyticsService) TelemetryService {
t := &telemetryService{
AnalyticsService: analytics,
notifier: notifier,
jobsQueue: utils.NewOpsQueue(utils.OpsQueueParams{
Name: "telemetry",
MinSize: jobsQueueMinSize,
FlushOnStop: true,
Logger: logger.GetLogger(),
}),
workers: make(map[livekit.ParticipantID]*StatsWorker),
}
t.jobsQueue.Start()
go t.run()
return t
}
func (t *telemetryService) FlushStats() {
t.flushMu.Lock()
defer t.flushMu.Unlock()
t.workersMu.RLock()
worker := t.workerList
t.workersMu.RUnlock()
now := time.Now()
var prev, reap *StatsWorker
for worker != nil {
next := worker.next
if closed := worker.Flush(now); closed {
if prev == nil {
// this worker was at the head of the list
t.workersMu.Lock()
p := &t.workerList
for *p != worker {
// new workers have been added. scan until we find the one
// immediately before this
prev = *p
p = &prev.next
}
*p = worker.next
t.workersMu.Unlock()
} else {
prev.next = worker.next
}
worker.next = reap
reap = worker
} else {
prev = worker
}
worker = next
}
if reap != nil {
t.workersMu.Lock()
for reap != nil {
if reap == t.workers[reap.participantID] {
delete(t.workers, reap.participantID)
}
reap = reap.next
}
t.workersMu.Unlock()
}
}
func (t *telemetryService) run() {
for range time.Tick(telemetryStatsUpdateInterval) {
t.FlushStats()
}
}
func (t *telemetryService) enqueue(op func()) {
t.jobsQueue.Enqueue(op)
}
func (t *telemetryService) getWorker(participantID livekit.ParticipantID) (worker *StatsWorker, ok bool) {
t.workersMu.RLock()
defer t.workersMu.RUnlock()
worker, ok = t.workers[participantID]
return
}
func (t *telemetryService) getOrCreateWorker(
ctx context.Context,
roomID livekit.RoomID,
roomName livekit.RoomName,
participantID livekit.ParticipantID,
participantIdentity livekit.ParticipantIdentity,
) (*StatsWorker, bool) {
t.workersMu.Lock()
defer t.workersMu.Unlock()
worker, ok := t.workers[participantID]
if ok && !worker.Closed() {
return worker, true
}
existingIsConnected := false
if ok {
existingIsConnected = worker.IsConnected()
}
worker = newStatsWorker(
ctx,
t,
roomID,
roomName,
participantID,
participantIdentity,
)
if existingIsConnected {
worker.SetConnected()
}
t.workers[participantID] = worker
worker.next = t.workerList
t.workerList = worker
return worker, false
}
func (t *telemetryService) LocalRoomState(ctx context.Context, info *livekit.AnalyticsNodeRooms) {
t.enqueue(func() {
t.SendNodeRoomStates(ctx, info)
})
}