Files
livekit/pkg/telemetry/telemetryservice.go
Raja Subramanian bec7453a1f Recreate stats worker on resume if needed. (#2982)
* Ref count the stats worker.

NOTE: Don't liek this much, but wanted to open this get some 👀 on
this and get feedback.

There are two entities, one for counting signal bytes and another for
media stats. They both send `ParticipantJoined` and `ParticipantLeft`
event.

In the case of a participant resume, as the old web socket
connection is closed, that triggers a signal stats counter close. That
would call `ParticipantLeft` and that would close the stats worker.

The closed stats worker got reaped in `FlushStats` after three minutes.

So, all events after that did not have a worker and hence went
unreported including missing participant_left webhook because it relied
on checking if a participant was ever connected and that needed to check
the worker state.

Using a ref count to keep track of join/leaves. And not close the worker
until ref count goes down to 0.

* create a stats worker on resume

* revert incorrect changes

* transfer connected state

* transfer connected state when creating worker

* resolve participant on a resume
2024-09-06 23:58:03 +05:30

236 lines
8.3 KiB
Go

// Copyright 2023 LiveKit, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package telemetry
import (
"context"
"sync"
"time"
"github.com/livekit/livekit-server/pkg/config"
"github.com/livekit/livekit-server/pkg/utils"
"github.com/livekit/protocol/livekit"
"github.com/livekit/protocol/logger"
"github.com/livekit/protocol/webhook"
)
//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 . TelemetryService
type TelemetryService interface {
// TrackStats is called periodically for each track in both directions (published/subscribed)
TrackStats(key StatsKey, stat *livekit.AnalyticsStat)
// events
RoomStarted(ctx context.Context, room *livekit.Room)
RoomEnded(ctx context.Context, room *livekit.Room)
// ParticipantJoined - a participant establishes signal connection to a room
ParticipantJoined(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, clientInfo *livekit.ClientInfo, clientMeta *livekit.AnalyticsClientMeta, shouldSendEvent bool)
// ParticipantActive - a participant establishes media connection
ParticipantActive(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, clientMeta *livekit.AnalyticsClientMeta, isMigration bool)
// ParticipantResumed - there has been an ICE restart or connection resume attempt, and we've received their signal connection
ParticipantResumed(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, nodeID livekit.NodeID, reason livekit.ReconnectReason)
// ParticipantLeft - the participant leaves the room, only sent if ParticipantActive has been called before
ParticipantLeft(ctx context.Context, room *livekit.Room, participant *livekit.ParticipantInfo, shouldSendEvent bool)
// TrackPublishRequested - a publication attempt has been received
TrackPublishRequested(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo)
// TrackPublished - a publication attempt has been successful
TrackPublished(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo)
// TrackUnpublished - a participant unpublished a track
TrackUnpublished(ctx context.Context, participantID livekit.ParticipantID, identity livekit.ParticipantIdentity, track *livekit.TrackInfo, shouldSendEvent bool)
// TrackSubscribeRequested - a participant requested to subscribe to a track
TrackSubscribeRequested(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackSubscribed - a participant subscribed to a track successfully
TrackSubscribed(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, publisher *livekit.ParticipantInfo, shouldSendEvent bool)
// TrackUnsubscribed - a participant unsubscribed from a track successfully
TrackUnsubscribed(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, shouldSendEvent bool)
// TrackSubscribeFailed - failure to subscribe to a track
TrackSubscribeFailed(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, err error, isUserError bool)
// TrackMuted - the publisher has muted the Track
TrackMuted(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackUnmuted - the publisher has muted the Track
TrackUnmuted(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackPublishedUpdate - track metadata has been updated
TrackPublishedUpdate(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo)
// TrackMaxSubscribedVideoQuality - publisher is notified of the max quality subscribers desire
TrackMaxSubscribedVideoQuality(ctx context.Context, participantID livekit.ParticipantID, track *livekit.TrackInfo, mime string, maxQuality livekit.VideoQuality)
TrackPublishRTPStats(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, mimeType string, layer int, stats *livekit.RTPStats)
TrackSubscribeRTPStats(ctx context.Context, participantID livekit.ParticipantID, trackID livekit.TrackID, mimeType string, stats *livekit.RTPStats)
EgressStarted(ctx context.Context, info *livekit.EgressInfo)
EgressUpdated(ctx context.Context, info *livekit.EgressInfo)
EgressEnded(ctx context.Context, info *livekit.EgressInfo)
IngressCreated(ctx context.Context, info *livekit.IngressInfo)
IngressDeleted(ctx context.Context, info *livekit.IngressInfo)
IngressStarted(ctx context.Context, info *livekit.IngressInfo)
IngressUpdated(ctx context.Context, info *livekit.IngressInfo)
IngressEnded(ctx context.Context, info *livekit.IngressInfo)
LocalRoomState(ctx context.Context, info *livekit.AnalyticsNodeRooms)
// helpers
AnalyticsService
NotifyEvent(ctx context.Context, event *livekit.WebhookEvent)
FlushStats()
}
const (
workerCleanupWait = 3 * time.Minute
jobsQueueMinSize = 2048
)
type telemetryService struct {
AnalyticsService
notifier webhook.QueuedNotifier
jobsQueue *utils.OpsQueue
workersMu sync.RWMutex
workers map[livekit.ParticipantID]*StatsWorker
workerList *StatsWorker
flushMu sync.Mutex
}
func NewTelemetryService(notifier webhook.QueuedNotifier, analytics AnalyticsService) TelemetryService {
t := &telemetryService{
AnalyticsService: analytics,
notifier: notifier,
jobsQueue: utils.NewOpsQueue(utils.OpsQueueParams{
Name: "telemetry",
MinSize: jobsQueueMinSize,
FlushOnStop: true,
Logger: logger.GetLogger(),
}),
workers: make(map[livekit.ParticipantID]*StatsWorker),
}
t.jobsQueue.Start()
go t.run()
return t
}
func (t *telemetryService) FlushStats() {
t.flushMu.Lock()
defer t.flushMu.Unlock()
t.workersMu.RLock()
worker := t.workerList
t.workersMu.RUnlock()
now := time.Now()
var prev, reap *StatsWorker
for worker != nil {
next := worker.next
if closed := worker.Flush(now); closed {
if prev == nil {
// this worker was at the head of the list
t.workersMu.Lock()
p := &t.workerList
for *p != worker {
// new workers have been added. scan until we find the one
// immediately before this
prev = *p
p = &prev.next
}
*p = worker.next
t.workersMu.Unlock()
} else {
prev.next = worker.next
}
worker.next = reap
reap = worker
} else {
prev = worker
}
worker = next
}
if reap != nil {
t.workersMu.Lock()
for reap != nil {
if reap == t.workers[reap.participantID] {
delete(t.workers, reap.participantID)
}
reap = reap.next
}
t.workersMu.Unlock()
}
}
func (t *telemetryService) run() {
for range time.Tick(config.TelemetryStatsUpdateInterval) {
t.FlushStats()
}
}
func (t *telemetryService) enqueue(op func()) {
t.jobsQueue.Enqueue(op)
}
func (t *telemetryService) getWorker(participantID livekit.ParticipantID) (worker *StatsWorker, ok bool) {
t.workersMu.RLock()
defer t.workersMu.RUnlock()
worker, ok = t.workers[participantID]
return
}
func (t *telemetryService) getOrCreateWorker(
ctx context.Context,
roomID livekit.RoomID,
roomName livekit.RoomName,
participantID livekit.ParticipantID,
participantIdentity livekit.ParticipantIdentity,
transferConnectedState bool,
) (*StatsWorker, bool) {
t.workersMu.Lock()
defer t.workersMu.Unlock()
worker, ok := t.workers[participantID]
if ok && !worker.Closed() {
return worker, true
}
existingIsConnected := false
if ok && transferConnectedState {
existingIsConnected = worker.IsConnected()
}
worker = newStatsWorker(
ctx,
t,
roomID,
roomName,
participantID,
participantIdentity,
)
if existingIsConnected {
worker.SetConnected()
}
t.workers[participantID] = worker
worker.next = t.workerList
t.workerList = worker
return worker, false
}
func (t *telemetryService) LocalRoomState(ctx context.Context, info *livekit.AnalyticsNodeRooms) {
t.enqueue(func() {
t.SendNodeRoomStates(ctx, info)
})
}