Files
livekit/pkg/sfu/streamallocator.go
Raja Subramanian 092789a08f Stream allocator fixes (#212)
* Stream allocator fixes

- Treat simple track like simulcast track with one layer to make
it stream allocator friendly.

* Address David's comments
2021-11-27 09:22:39 +05:30

1344 lines
42 KiB
Go

//
// Design of StreamAllocator
//
// Each participant uses one peer connection for all downstream
// traffic. It is possible that the downstream peer connection
// gets congested. In such an event, the SFU (sender on that
// peer connection) should take measures to mitigate the
// media loss and latency that would result from such a congestion.
//
// This module is supposed to aggregate down stream tracks and
// drive bandwidth allocation with the goals of
// - Try and send highest quality media
// - React as quickly as possible to mitigate congestion
//
// Setup:
// ------
// The following should be done to set up a stream allocator
// - There will be one of these per subscriber peer connection.
// Created in livekit-server/transport.go for subscriber type
// peer connections.
// - In `AddSubscribedTrack` of livekit-server/participant.go, the created
// downTrack is added to the stream allocator.
// - In `RemoveSubscribedTrack` of livekit-server/participant.go,
// the downTrack is removed from the stream allocator.
// - Both video and audio tracks are added to this module. Although the
// stream allocator does not act on audio track forwarding, audio track
// information like loss rate may be used to adjust available bandwidth.
//
// Callbacks:
// ----------
// StreamAllocator registers the following callbacks on all registered down tracks
// - OnREMB: called when down track receives RTCP REMB. Note that REMB is a
// peer connection level aggregate metric. But, it contains all the SSRCs
// used in the calculation of that REMB. So, there could be multiple
// callbacks per RTCP REMB received (one each from down track pertaining
// to the contained SSRCs) with the same estimated channel capacity.
// - AddReceiverReportListener: called when down track received RTCP RR (Receiver Report).
// - OnAvailableLayersChanged: called when the feeding track changes its layers.
// This could happen due to publisher throttling layers due to upstream congestion
// in its path.
// - OnSubscriptionChanged: called when a down track's settings are changed as a
// result of client side requests (muting/unmuting)
// - OnSubscribedLayersChanged: called when a down track's settings are changed as a
// result of client side requests (limiting maximum layer).
// - OnPacketSent: called when a media packet is forwarded by the down track. As
// this happens once per forwarded packet, processing in this callback should be
// kept to a minimum.
//
// The following may be needed depending on the StreamAllocator algorithm
// - OnBitrateUpdate: called periodically to update the bit rate at which a down track
// is forwarding. This can be used to measure any overshoot and adjust allocations
// accordingly. This may have granular information like primary bitrate, retransmitted
// bitrate and padding bitrate.
//
// State machine:
// --------------
// The most critical component. It should monitor current state of channel and
// take actions to provide the best user experience by striving to achieve the
// goals outlined earlier
//
// States:
// ------
// - State_PRE_COMMIT: Before the first estimate is committed.
// Estimated channel capacity is initialized to some
// arbitrarily high value to start streaming immediately.
// Serves two purposes
// 1. Gives the bandwidth estimation algorithms data
// 2. Start streaming as soon as a user joins. Imagine
// a user joining a room with 10 participants already
// in it. That user should start receiving streams
// from everybody as soon as possible.
// - State_STABLE: When all streams are forwarded at their optimal requested layers.
// - State_DEFICIENT: When at least one stream is not able to forward optimal requested layers.
// - State_GRATUITOUS_PROBING: When all streams are forwarded at their optimal requested layers,
// but probing for extra capacity to be prepared for cases like
// new participant joining and streaming OR an existing participant
// starting a new stream like enabling camera or screen share.
//
// Signals:
// -------
// Each state should take action based on these signals and advance the state machine based
// on the result of the action.
// - Signal_ADD_TRACK: A new track has been added.
// - Signal_REMOVE_TRACK: An existing track has been removed.
// - Signal_ESTIMATE_INCREASE: Estimated channel capacity is increasing.
// - Signal_ESTIMATE_DECREASE: Estimated channel capacity is decreasing. Note that when
// channel gets congested, it is possible to get several of these
// in a very short time window.
// - Signal_RECEIVER_REPORT: An RTCP Receiver Report received from some down track.
// - Signal_AVAILABLE_LAYERS_ADD: Available layers of publisher changed, new layer(s) available.
// - Signal_AVAILABLE_LAYERS_REMOVE: Available layers of publisher changed, some previously
// available layer(s) not available anymore.
// - Signal_SUBSCRIPTION_CHANGE: Subscription changed (mute/unmute)
// - Signal_SUBSCRIBED_LAYERS_CHANGE: Subscribed layers changed (requested layers changed).
// - Signal_PERIODIC_PING: Periodic ping
//
// There are several interesting challenges which are documented in relevant code below.
//
package sfu
import (
"fmt"
"math"
"sort"
"sync"
"time"
"github.com/livekit/protocol/logger"
"github.com/pion/rtcp"
"github.com/pion/webrtc/v3"
)
// Tunables used by the stream allocator for capacity estimation, estimate
// boosting and gratuitous probing.
const (
// InitialChannelCapacity seeds both the committed channel capacity and the
// received estimate of a new StreamAllocator (see NewStreamAllocator).
InitialChannelCapacity = 100 * 1000 * 1000 // 100 Mbps
// EstimateEpsilon is the difference between consecutive received estimates
// above which maybeCommitEstimate considers the estimate to be still changing.
EstimateEpsilon = 2000 // 2 kbps
// Boost* are used by code outside this section - presumably percentage and
// min/max bounds (bps) of an artificial estimate boost; TODO confirm.
BoostPct = 8
BoostMinBps = 20 * 1000 // 20 kbps
BoostMaxBps = 60 * 1000 // 60 kbps
// GratuitousProbe* are used by code outside this section - presumably limits
// applied when probing for extra capacity; TODO confirm.
GratuitousProbeHeadroomBps = 1 * 1000 * 1000 // if headroom is more than 1 Mbps, don't probe
GratuitousProbePct = 10
GratuitousProbeMaxBps = 300 * 1000 // 300 kbps
// NOTE: despite the "Ms" suffix these two are time.Duration values,
// not millisecond counts.
GratuitousProbeMinDurationMs = 500 * time.Millisecond
GratuitousProbeMaxDurationMs = 600 * time.Millisecond
// Audio/VideoLossWeight - presumably the relative weights of audio and video
// loss in calculateLoss (weighting audio higher); TODO confirm.
AudioLossWeight = 0.75
VideoLossWeight = 0.25
)
// State identifies the mode the stream allocator state machine is in.
type State int

// States of the stream allocator state machine.
const (
	// State_PRE_COMMIT is the state before the first estimate is committed.
	State_PRE_COMMIT State = iota
	// State_STABLE means all streams are forwarded at their optimal requested layers.
	State_STABLE
	// State_DEFICIENT means at least one stream cannot forward its optimal requested layers.
	State_DEFICIENT
	// State_GRATUITOUS_PROBING means streaming is optimal and extra capacity is being probed for.
	State_GRATUITOUS_PROBING
)

// String returns the name of the state without the "State_" prefix.
// A value that is not one of the declared states is rendered as its
// decimal integer value.
func (s State) String() string {
	names := [...]string{
		State_PRE_COMMIT:         "PRE_COMMIT",
		State_STABLE:             "STABLE",
		State_DEFICIENT:          "DEFICIENT",
		State_GRATUITOUS_PROBING: "GRATUITOUS_PROBING",
	}
	if s < 0 || int(s) >= len(names) {
		return fmt.Sprintf("%d", int(s))
	}
	return names[s]
}
// Signal identifies the kind of event fed into the stream allocator state machine.
type Signal int

// Signals understood by the stream allocator state machine.
const (
	// Signal_NONE means there is nothing to report.
	Signal_NONE Signal = iota
	// Signal_ADD_TRACK: a new track has been added.
	Signal_ADD_TRACK
	// Signal_REMOVE_TRACK: an existing track has been removed.
	Signal_REMOVE_TRACK
	// Signal_ESTIMATE_INCREASE: committed channel capacity went up.
	Signal_ESTIMATE_INCREASE
	// Signal_ESTIMATE_DECREASE: committed channel capacity went down.
	Signal_ESTIMATE_DECREASE
	// Signal_RECEIVER_REPORT: an RTCP Receiver Report was received.
	Signal_RECEIVER_REPORT
	// Signal_AVAILABLE_LAYERS_ADD: publisher made new layer(s) available.
	Signal_AVAILABLE_LAYERS_ADD
	// Signal_AVAILABLE_LAYERS_REMOVE: publisher stopped some layer(s).
	Signal_AVAILABLE_LAYERS_REMOVE
	// Signal_SUBSCRIPTION_CHANGE: subscription changed (mute/unmute).
	Signal_SUBSCRIPTION_CHANGE
	// Signal_SUBSCRIBED_LAYERS_CHANGE: requested layers changed.
	Signal_SUBSCRIBED_LAYERS_CHANGE
	// Signal_PERIODIC_PING: periodic tick.
	Signal_PERIODIC_PING
)

// String returns the name of the signal without the "Signal_" prefix.
// A value that is not one of the declared signals is rendered as its
// decimal integer value.
func (s Signal) String() string {
	names := [...]string{
		Signal_NONE:                     "NONE",
		Signal_ADD_TRACK:                "ADD_TRACK",
		Signal_REMOVE_TRACK:             "REMOVE_TRACK",
		Signal_ESTIMATE_INCREASE:        "ESTIMATE_INCREASE",
		Signal_ESTIMATE_DECREASE:        "ESTIMATE_DECREASE",
		Signal_RECEIVER_REPORT:          "RECEIVER_REPORT",
		Signal_AVAILABLE_LAYERS_ADD:     "AVAILABLE_LAYERS_ADD",
		Signal_AVAILABLE_LAYERS_REMOVE:  "AVAILABLE_LAYERS_REMOVE",
		Signal_SUBSCRIPTION_CHANGE:      "SUBSCRIPTION_CHANGE",
		Signal_SUBSCRIBED_LAYERS_CHANGE: "SUBSCRIBED_LAYERS_CHANGE",
		Signal_PERIODIC_PING:            "PERIODIC_PING",
	}
	if s < 0 || int(s) >= len(names) {
		return fmt.Sprintf("%d", int(s))
	}
	return names[s]
}
// BoostMode identifies how the allocator boosts when it is not streaming optimally.
type BoostMode int

// Available boost modes.
const (
	BoostMode_LAYER BoostMode = iota
	BoostMode_BANDWIDTH
)

// String returns the name of the boost mode without the "BoostMode_" prefix.
// A value that is not one of the declared modes is rendered as its decimal
// integer value.
func (b BoostMode) String() string {
	if b == BoostMode_LAYER {
		return "LAYER"
	}
	if b == BoostMode_BANDWIDTH {
		return "BANDWIDTH"
	}
	return fmt.Sprintf("%d", int(b))
}
// Timing thresholds of the stream allocator. They are package-level variables
// (not constants). Despite the "Ms" suffix each one is a time.Duration value,
// not a millisecond count.
var (
// LK-TODO-START
// These constants will definitely require more tweaking.
// In fact, simple time thresholded rules most probably will not be enough.
// LK-TODO-END
// EstimateCommitMs is the minimum time between two commits of the channel
// capacity estimate (see maybeCommitEstimate).
EstimateCommitMs = 2 * 1000 * time.Millisecond // 2 seconds
// The wait durations below are used by code outside this section.
ProbeWaitMs = 5 * 1000 * time.Millisecond // 5 seconds
GratuitousProbeWaitMs = 8 * 1000 * time.Millisecond // 8 seconds
BoostWaitMs = 3 * 1000 * time.Millisecond // 3 seconds
)
// StreamAllocatorParams carries the inputs of NewStreamAllocator.
type StreamAllocatorParams struct {
// ParticipantID is attached to log entries and passed on to the prober.
ParticipantID string
// Logger is used by the allocator and passed on to the prober.
Logger logger.Logger
}
// StreamAllocator aggregates the down tracks of one subscriber peer connection
// and drives bandwidth allocation across them via a small state machine fed by
// events (see Event, runStateMachine).
type StreamAllocator struct {
participantID string
logger logger.Logger
// onStreamedTracksChange, if set, is invoked by allocate with the tracks that
// were paused/resumed by that allocation.
onStreamedTracksChange func(paused map[string][]string, resumed map[string][]string) error
// estimateMu is held by onREMB, ping and getChannelCapacity while they touch
// the estimate fields below.
estimateMu sync.RWMutex
// trackingSSRC is the SSRC whose REMB callbacks are acted upon (see onREMB).
trackingSSRC uint32
committedChannelCapacity uint64
lastCommitTime time.Time
prevReceivedEstimate uint64
receivedEstimate uint64
lastEstimateDecreaseTime time.Time
boostMode BoostMode
boostedChannelCapacity uint64
lastBoostTime time.Time
// tracksMu guards tracks and tracksSorted.
tracksMu sync.RWMutex
// tracks is keyed by down track ID.
tracks map[string]*Track
// tracksSorted holds the same tracks, re-sorted after every add, remove and
// max layer change; allocate walks it in order.
tracksSorted TrackSorter
prober *Prober
// state is the current state of the state machine (changed via setState).
state State
// chMu serializes posting to eventCh (read lock) against closing it in Stop
// (write lock).
chMu sync.RWMutex
eventCh chan []Event
// runningCh is closed by Stop; isRunning reports whether it is still open.
runningCh chan struct{}
}
// Event is one input of the stream allocator state machine.
type Event struct {
// Signal tells the state machine what happened.
Signal Signal
// DownTrack is the track the signal pertains to. It is left nil by
// the estimate and periodic ping events posted from onREMB and ping.
DownTrack *DownTrack
}
// NewStreamAllocator builds a StreamAllocator for the participant described by
// params. The allocator starts in State_PRE_COMMIT with both the committed
// channel capacity and the received estimate set to InitialChannelCapacity,
// and registers itself as the probe sender of its prober. Call Start to begin
// processing events.
func NewStreamAllocator(params StreamAllocatorParams) *StreamAllocator {
	proberParams := ProberParams{
		ParticipantID: params.ParticipantID,
		Logger:        params.Logger,
	}

	allocator := &StreamAllocator{
		participantID: params.ParticipantID,
		logger:        params.Logger,

		committedChannelCapacity: InitialChannelCapacity,
		lastCommitTime:           time.Now(),
		receivedEstimate:         InitialChannelCapacity,
		lastEstimateDecreaseTime: time.Now(),
		boostMode:                BoostMode_LAYER,

		tracks: make(map[string]*Track),
		prober: NewProber(proberParams),
		state:  State_PRE_COMMIT,

		eventCh:   make(chan []Event, 10),
		runningCh: make(chan struct{}),
	}

	// the prober asks the allocator to emit padding when it wants to probe
	allocator.prober.OnSendProbe(allocator.onSendProbe)

	return allocator
}
// Start launches the two background goroutines of the allocator: the event
// loop (processEvents) and the periodic ping (ping).
func (s *StreamAllocator) Start() {
go s.processEvents()
go s.ping()
}
// Stop shuts the allocator down: it marks it as not running and closes the
// event channel, which ends the event loop once pending events are drained.
//
// Stop is idempotent. Closing an already closed channel panics, so a repeated
// call returns without doing anything.
func (s *StreamAllocator) Stop() {
	// the write lock keeps postEvent from sending on eventCh while it is closed
	s.chMu.Lock()
	defer s.chMu.Unlock()

	if !s.isRunning() {
		// already stopped
		return
	}

	close(s.runningCh)
	close(s.eventCh)
}
// OnStreamedTracksChange registers f as the callback that allocate invokes
// when an allocation pauses or resumes tracks. A later call replaces the
// previously registered callback.
func (s *StreamAllocator) OnStreamedTracksChange(f func(paused map[string][]string, resumed map[string][]string) error) {
s.onStreamedTracksChange = f
}
// AddTrack starts managing downTrack, which carries media published by peerID.
// It registers the allocator's callbacks on the down track, records the track
// (keeping the sorted track list sorted) and posts Signal_ADD_TRACK to the
// state machine.
func (s *StreamAllocator) AddTrack(downTrack *DownTrack, peerID string) {
	// route feedback and notifications of the down track to this allocator
	downTrack.OnREMB(s.onREMB)
	downTrack.AddReceiverReportListener(s.onReceiverReport)
	downTrack.OnAvailableLayersChanged(s.onAvailableLayersChanged)
	downTrack.OnSubscriptionChanged(s.onSubscriptionChanged)
	downTrack.OnSubscribedLayersChanged(s.onSubscribedLayersChanged)
	downTrack.OnPacketSent(s.onPacketSent)

	func() {
		s.tracksMu.Lock()
		defer s.tracksMu.Unlock()

		added := newTrack(downTrack, peerID)
		s.tracks[downTrack.ID()] = added
		s.tracksSorted = append(s.tracksSorted, added)
		sort.Sort(s.tracksSorted)
	}()

	// event is posted only after the tracks lock has been released
	addEvent := Event{
		Signal:    Signal_ADD_TRACK,
		DownTrack: downTrack,
	}
	s.postEvent(addEvent)
}
// RemoveTrack stops managing downTrack and posts Signal_REMOVE_TRACK to the
// state machine. It does nothing if downTrack is not managed by this
// allocator.
func (s *StreamAllocator) RemoveTrack(downTrack *DownTrack) {
	trackID := downTrack.ID()

	s.tracksMu.Lock()
	if _, ok := s.tracks[trackID]; !ok {
		s.tracksMu.Unlock()
		return
	}
	delete(s.tracks, trackID)

	// swap-remove from the sorted list; order is restored by the sort below
	last := len(s.tracksSorted) - 1
	for idx, track := range s.tracksSorted {
		if track.DownTrack() == downTrack {
			s.tracksSorted[idx] = s.tracksSorted[last]
			// clear the vacated slot so that the backing array does not keep
			// the removed track (and its down track) reachable
			s.tracksSorted[last] = nil
			s.tracksSorted = s.tracksSorted[:last]
			break
		}
	}
	sort.Sort(s.tracksSorted)
	s.tracksMu.Unlock()

	s.postEvent(Event{
		Signal:    Signal_REMOVE_TRACK,
		DownTrack: downTrack,
	})
}
// onREMB is the REMB callback registered on every down track. It records the
// reported bitrate as the latest received estimate, but only for the callback
// coming from the down track whose SSRC is the tracking SSRC; callbacks from
// all other down tracks are ignored. If recording the estimate leads to a
// commit (see maybeCommitEstimate), the resulting signal is posted to the
// state machine.
func (s *StreamAllocator) onREMB(downTrack *DownTrack, remb *rtcp.ReceiverEstimatedMaximumBitrate) {
// the channel capacity is estimated at a peer connection level. All down tracks
// in the peer connection will end up calling this for a REMB report with
// the same estimated channel capacity. Use a tracking SSRC to lock on to
// one report. As SSRCs can be dropped over time, update tracking SSRC as needed
//
// A couple of things to keep in mind
// - REMB reports could be sent gratuitously as a way of providing
// periodic feedback, i. e. even if the estimated capacity does not
// change, there could be REMB packets on the wire. Those gratuitous
// REMBs should not trigger anything bad.
// - As each down track will issue this callback for the same REMB packet
// from the wire, theoretically it is possible that one down track's
// callback from previous REMB comes after another down track's callback
// from the new REMB. REMBs could fire very quickly especially when
// the network is entering congestion.
// LK-TODO-START
// Need to check if the same SSRC reports can somehow race, i.e. does pion send
// RTCP dispatch for same SSRC on different threads? If not, the tracking SSRC
// should prevent racing
// LK-TODO-END
s.estimateMu.Lock()
// is the SSRC being tracked still part of this report?
found := false
for _, ssrc := range remb.SSRCs {
if ssrc == s.trackingSSRC {
found = true
break
}
}
if !found {
// tracking SSRC is gone from the report, pick a new one
if len(remb.SSRCs) == 0 {
s.estimateMu.Unlock()
s.logger.Warnw("no SSRC to track REMB", nil, "participant", s.participantID)
return
}
// try to lock to track which is sending this update
for _, ssrc := range remb.SSRCs {
if ssrc == downTrack.SSRC() {
s.trackingSSRC = downTrack.SSRC()
found = true
break
}
}
if !found {
// this down track is not in the report, fall back to the first SSRC listed
s.trackingSSRC = remb.SSRCs[0]
}
}
// only the callback from the tracked down track updates the estimate
if s.trackingSSRC != downTrack.SSRC() {
s.estimateMu.Unlock()
return
}
s.prevReceivedEstimate = s.receivedEstimate
s.receivedEstimate = uint64(remb.Bitrate)
if s.prevReceivedEstimate != s.receivedEstimate {
s.logger.Debugw("received new estimate", "participant", s.participantID, "old(bps)", s.prevReceivedEstimate, "new(bps)", s.receivedEstimate)
}
signal := s.maybeCommitEstimate()
s.estimateMu.Unlock()
// post outside the estimate lock
if signal != Signal_NONE {
s.postEvent(Event{
Signal: signal,
})
}
}
// LK-TODO-START
// Receiver report stats are not used in the current implementation.
//
// The idea is to use a loss/rtt based estimator and compare against REMB like outlined here
// https://datatracker.ietf.org/doc/html/draft-ietf-rmcat-gcc-02#section-6
//
// But the implementation could get quite tricky. So, a separate PR dedicated effort for that
// is required. Something like from Chrome, but hopefully much less complicated :-)
// https://source.chromium.org/chromium/chromium/src/+/main:third_party/webrtc/modules/congestion_controller/goog_cc/loss_based_bandwidth_estimation.cc;bpv=0;bpt=1
// LK-TODO-END
// onReceiverReport is the RTCP Receiver Report callback registered on every
// down track. It hands the report to the matching track so that the track can
// update its packet stats; reports for unknown down tracks are dropped.
func (s *StreamAllocator) onReceiverReport(downTrack *DownTrack, rr *rtcp.ReceiverReport) {
	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	track, known := s.tracks[downTrack.ID()]
	if !known {
		return
	}
	track.UpdatePacketStats(rr)
}
// onAvailableLayersChanged is called when the layer availability of the track
// feeding downTrack changes. It posts Signal_AVAILABLE_LAYERS_ADD when
// layerAdded is true and Signal_AVAILABLE_LAYERS_REMOVE otherwise.
func (s *StreamAllocator) onAvailableLayersChanged(downTrack *DownTrack, layerAdded bool) {
	// LK-TODO: Look at processing specific downtrack
	var signal Signal
	if layerAdded {
		signal = Signal_AVAILABLE_LAYERS_ADD
	} else {
		signal = Signal_AVAILABLE_LAYERS_REMOVE
	}

	s.postEvent(Event{Signal: signal, DownTrack: downTrack})
}
// onSubscriptionChanged is called when the subscription settings of downTrack
// change (muting/unmuting of track). It posts Signal_SUBSCRIPTION_CHANGE.
func (s *StreamAllocator) onSubscriptionChanged(downTrack *DownTrack) {
	// LK-TODO: Look at processing specific downtrack
	change := Event{
		Signal:    Signal_SUBSCRIPTION_CHANGE,
		DownTrack: downTrack,
	}
	s.postEvent(change)
}
// onSubscribedLayersChanged is called when the subscribed layers of downTrack
// change (limiting max layers). It records the new maximum layers on the
// matching track, re-sorts the track list and posts
// Signal_SUBSCRIBED_LAYERS_CHANGE. Unknown down tracks are ignored.
func (s *StreamAllocator) onSubscribedLayersChanged(downTrack *DownTrack, maxSpatialLayer int32, maxTemporalLayer int32) {
	// LK-TODO: Look at processing specific downtrack for reallocation
	updated := func() bool {
		s.tracksMu.Lock()
		defer s.tracksMu.Unlock()

		track, known := s.tracks[downTrack.ID()]
		if !known {
			return false
		}

		track.UpdateMaxLayers(maxSpatialLayer, maxTemporalLayer)
		sort.Sort(s.tracksSorted)
		return true
	}()
	if !updated {
		return
	}

	// event is posted only after the tracks lock has been released
	s.postEvent(Event{Signal: Signal_SUBSCRIBED_LAYERS_CHANGE, DownTrack: downTrack})
}
// onPacketSent is called when downTrack sends a packet of the given size.
// Sizes of non-audio packets are reported to the prober.
func (s *StreamAllocator) onPacketSent(downTrack *DownTrack, size int) {
	// bandwidth allocation is limited to video, so audio is not reported
	if downTrack.Kind() != webrtc.RTPCodecTypeAudio {
		s.prober.PacketSent(size)
	}
}
// onSendProbe is called when the prober wants bytesToSend bytes of probe
// traffic sent. Tracks are asked in turn to write padding until the requested
// amount is covered or there are no more tracks. It returns the number of
// bytes the tracks reported as sent.
func (s *StreamAllocator) onSendProbe(bytesToSend int) int {
	if bytesToSend <= 0 {
		return 0
	}

	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	remaining := bytesToSend
	for _, track := range s.tracks {
		if remaining <= 0 {
			break
		}
		remaining -= track.WritePaddingRTP(remaining)
	}

	// whatever is no longer remaining has been sent
	return bytesToSend - remaining
}
// postEvent queues event for the state machine. The event is dropped if the
// allocator has been stopped. The send blocks while the event channel is full.
func (s *StreamAllocator) postEvent(event Event) {
	// the read lock keeps Stop from closing the channel during the send
	s.chMu.RLock()
	defer s.chMu.RUnlock()

	if s.isRunning() {
		s.eventCh <- []Event{event}
	}
}
// processEvents is the event loop of the allocator. It runs every received
// event through the state machine and returns when the event channel is
// closed and drained, or when a nil batch is received.
func (s *StreamAllocator) processEvents() {
	for {
		batch, open := <-s.eventCh
		if !open || batch == nil {
			return
		}

		for i := range batch {
			s.runStateMachine(batch[i])
		}
	}
}
// isRunning reports whether the allocator has not been stopped yet,
// i.e. whether runningCh is still open. It never blocks.
func (s *StreamAllocator) isRunning() bool {
	select {
	case <-s.runningCh:
		// receive succeeds only once Stop has closed the channel
		return false
	default:
	}
	return true
}
// ping runs until the allocator is stopped. Once a second it tries to commit
// the latest received estimate (posting the resulting signal, if any) and
// then posts Signal_PERIODIC_PING to the state machine.
//
// The ticker is stopped on return to release its resources, and the loop
// also waits on runningCh so that it exits as soon as Stop is called instead
// of waiting for the next tick.
func (s *StreamAllocator) ping() {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-s.runningCh:
			return
		case <-ticker.C:
		}

		// select picks at random when both channels are ready, so re-check
		if !s.isRunning() {
			return
		}

		s.estimateMu.Lock()
		signal := s.maybeCommitEstimate()
		s.estimateMu.Unlock()

		if signal != Signal_NONE {
			s.postEvent(Event{
				Signal: signal,
			})
		}

		s.postEvent(Event{
			Signal: Signal_PERIODIC_PING,
		})
	}
}
// LK-TODO-START
// Typically, in a system like this, there are track priorities.
// It is either implemented as policy
// Examples:
// 1. active speaker gets hi-res, all else lo-res
// 2. screen share streams get hi-res, all else lo-res
// OR it is left up to the clients to subscribe explicitly to the quality they want.
//
// When such a policy is implemented, some of the state machine behaviour needs
// to be changed. For example, in State_DEFICIENT when Signal_ADD_TRACK, it does not
// allocate immediately (it allocates only if enough time has passed since last
// estimate decrease or since the last artificial estimate boost). But, if there is
// a policy and the added track falls in the high priority bucket, an allocation
// would be required even in State_DEFICIENT to ensure higher priority streams are
// forwarded without delay.
// LK-TODO-END
// LK-TODO-START
// A better implementation would be to handle each signal as for a lot of signals,
// all the states do the same thing. But, it is written the following way for
// better readability. Will do the refactor once this code is more stable and
// more people have a chance to get familiar with it.
// LK-TODO-END
// runStateMachine dispatches event to the handler of the current state.
// An event received while in a state without a handler is ignored.
func (s *StreamAllocator) runStateMachine(event Event) {
switch s.state {
case State_PRE_COMMIT:
s.runStatePreCommit(event)
case State_STABLE:
s.runStateStable(event)
case State_DEFICIENT:
s.runStateDeficient(event)
case State_GRATUITOUS_PROBING:
s.runStateGratuitousProbing(event)
}
}
// runStatePreCommit handles event while in State_PRE_COMMIT: track, layer and
// subscription changes trigger an allocation, the first committed estimate
// (in either direction) moves the state machine to State_STABLE, and
// everything else is ignored.
//
// LK-TODO: ADD_TRACK should not reallocate if added track is audio
func (s *StreamAllocator) runStatePreCommit(event Event) {
	switch event.Signal {
	case Signal_ADD_TRACK,
		Signal_REMOVE_TRACK,
		Signal_AVAILABLE_LAYERS_ADD,
		Signal_AVAILABLE_LAYERS_REMOVE,
		Signal_SUBSCRIPTION_CHANGE,
		Signal_SUBSCRIBED_LAYERS_CHANGE:
		s.allocate()

	case Signal_ESTIMATE_INCREASE, Signal_ESTIMATE_DECREASE:
		// An increase should never happen as the initialized capacity is very high.
		//
		// On a decrease, the first estimate could be off as things are ramping up.
		//
		// Basically, an initial channel capacity of InitialChannelCapacity
		// which is very high is used so that there is no throttling before
		// getting an estimate. The first estimate happens 6 - 8 seconds
		// after streaming starts. With screen share like stream, the traffic
		// is sparse/spikey depending on the content. So, the estimate could
		// be small and still ramping up. So, doing an allocation could result
		// in stream being paused.
		//
		// This is one of the significant challenges here, i. e. time alignment.
		// Estimate was potentially measured using data from a little while back.
		// But, that estimate is used to allocate based on current bitrate.
		// So, in the intervening period if there was movement on the screen,
		// the bitrate could have spiked and the REMB value is probably stale.
		//
		// A few options to consider
		//   o what is implemented here (i. e. change to STABLE state and
		//     let further streaming drive the estimation and move the state
		//     machine in whichever direction it needs to go)
		//   o Forward padding packets also. But, that could have an adverse
		//     effect on the channel when a new stream comes on line and
		//     starts probing, i. e. it could affect existing flows if it
		//     congests the channel because of the spurt of padding packets.
		//   o Do probing downstream in the first few seconds in addition to
		//     forwarding any streams and get a better estimate of the channel.
		//   o Measure up stream bitrates over a much longer window to smooth
		//     out spikes and get a more steady-state rate and use it in
		//     allocations.
		//
		// A good challenge. Basically, the goal should be to avoid re-allocation
		// of streams. Any re-allocation means potential for estimate and current
		// state of up streams not being in sync because of time differences resulting
		// in streams getting paused unnecessarily.
		s.setState(State_STABLE)

	case Signal_RECEIVER_REPORT, Signal_PERIODIC_PING:
		// nothing to do
	}
}
// runStateStable handles event while in State_STABLE: anything that can
// change what should be forwarded triggers an allocation, and the periodic
// ping may start gratuitous probing.
func (s *StreamAllocator) runStateStable(event Event) {
	switch event.Signal {
	case Signal_ADD_TRACK,
		Signal_ESTIMATE_DECREASE,
		Signal_AVAILABLE_LAYERS_ADD,
		Signal_AVAILABLE_LAYERS_REMOVE,
		Signal_SUBSCRIPTION_CHANGE,
		Signal_SUBSCRIBED_LAYERS_CHANGE:
		s.allocate()

	case Signal_REMOVE_TRACK:
		// LK-TODO - may want to re-calculate channel usage?

	case Signal_ESTIMATE_INCREASE:
		// streaming optimally, no need to do anything

	case Signal_RECEIVER_REPORT:
		// nothing to do

	case Signal_PERIODIC_PING:
		// if bandwidth estimate has been stable for a while, maybe gratuitously probe
		if probing := s.maybeGratuitousProbe(); probing {
			s.setState(State_GRATUITOUS_PROBING)
		}
	}
}
// LK-TODO-START
// The current implementation tries to probe using media if the allocation is not optimal.
//
// But, another option to try is using padding only to probe even when deficient.
// In the current impl, starting a new stream or moving stream to a new layer might end up
// affecting more streams, i.e. in the sense that we started a new stream and user sees
// that affecting all the streams. Using padding only means a couple of things
// 1. If padding packets get lost, no issue
// 2. From a user perspective, it will appear like a network glitch and not due to
// starting a new stream. This is not desirable (i. e. just because user does not see
// it as a direct effect of starting a stream does not mean it is preferred).
// But, something to keep in mind in terms of user perception.
// LK-TODO-END
// runStateDeficient handles event while in State_DEFICIENT: most changes
// trigger an allocation (a committed estimate decrease first drops any
// boost), while newly available layers and the periodic ping only try a probe.
func (s *StreamAllocator) runStateDeficient(event Event) {
	switch event.Signal {
	case Signal_ESTIMATE_DECREASE:
		// stop using the boosted estimate
		s.resetBoost()
		s.allocate()

	case Signal_ADD_TRACK,
		Signal_REMOVE_TRACK,
		Signal_ESTIMATE_INCREASE,
		Signal_AVAILABLE_LAYERS_REMOVE,
		Signal_SUBSCRIPTION_CHANGE,
		Signal_SUBSCRIBED_LAYERS_CHANGE:
		// NOTE(review): ADD_TRACK allocates immediately here, whereas the
		// LK-TODO above runStateMachine describes it as not allocating
		// immediately in this state - confirm which one is intended.
		//
		// For ESTIMATE_INCREASE: as long as estimate is increasing, keep going.
		// Try an allocation to check if state can move to STABLE.
		s.allocate()

	case Signal_AVAILABLE_LAYERS_ADD:
		// new track coming online will trigger this.
		// LK-TODO: probe using the new track rather than trying with existing tracks.
		s.maybeProbe()

	case Signal_PERIODIC_PING:
		s.maybeProbe()

	case Signal_RECEIVER_REPORT:
		// nothing to do
	}
}
// runStateGratuitousProbing handles event while in State_GRATUITOUS_PROBING.
//
// For anything that needs a run of allocation, the prober is stopped first as
// the traffic shape will be altered after an allocation. Although the prober
// will take into account regular traffic, it is conservatively stopped before
// an allocation to avoid any self-inflicted damage. This applies to every
// allocating signal, including Signal_ADD_TRACK.
func (s *StreamAllocator) runStateGratuitousProbing(event Event) {
	switch event.Signal {
	case Signal_ADD_TRACK,
		Signal_ESTIMATE_DECREASE,
		Signal_AVAILABLE_LAYERS_ADD,
		Signal_AVAILABLE_LAYERS_REMOVE,
		Signal_SUBSCRIPTION_CHANGE,
		Signal_SUBSCRIBED_LAYERS_CHANGE:
		// stop gratuitous probing immediately and allocate
		s.prober.Reset()
		s.allocate()

	case Signal_REMOVE_TRACK:
		// LK-TODO - may want to re-calculate channel usage?

	case Signal_ESTIMATE_INCREASE:
		// good, got a better estimate. Prober may or may not have finished.
		// Let it continue if it is still running.

	case Signal_RECEIVER_REPORT:
		// nothing to do

	case Signal_PERIODIC_PING:
		if s.prober.IsRunning() {
			// current probe is still in flight
			return
		}

		// try for more
		if s.maybeGratuitousProbe() {
			s.logger.Debugw("trying more gratuitous probing", "participant", s.participantID)
		} else {
			s.setState(State_STABLE)
		}
	}
}
// setState moves the state machine to state and logs the transition.
// Setting the state it is already in is a no-op and is not logged.
func (s *StreamAllocator) setState(state State) {
	if s.state == state {
		return
	}

	s.logger.Infow("state change", "participant", s.participantID, "from", s.state.String(), "to", state.String())
	s.state = state
}
// maybeCommitEstimate commits the latest received estimate as the channel
// capacity if enough time has passed since the last commit and the estimate
// differs from the committed capacity.
//
// It returns Signal_ESTIMATE_INCREASE or Signal_ESTIMATE_DECREASE when a
// commit happened and Signal_NONE otherwise. It reads and writes the estimate
// fields without locking; both visible callers (onREMB, ping) hold estimateMu
// around the call.
func (s *StreamAllocator) maybeCommitEstimate() Signal {
// commit channel capacity estimate under following rules
// 1. Abs(receivedEstimate - prevReceivedEstimate) < EstimateEpsilon => estimate stable
// 2. time.Since(lastCommitTime) > EstimateCommitMs => to catch long oscillating estimate
if math.Abs(float64(s.receivedEstimate)-float64(s.prevReceivedEstimate)) > EstimateEpsilon {
// too large a change, wait for estimate to settle.
// Unless estimate has been oscillating for too long.
if time.Since(s.lastCommitTime) < EstimateCommitMs {
return Signal_NONE
}
}
// don't commit too often even if the change is small.
// Small changes will also get picked up during periodic check.
// NOTE(review): this unconditional time gate subsumes the one above, so the
// epsilon check currently has no effect on the outcome - confirm intent.
if time.Since(s.lastCommitTime) < EstimateCommitMs {
return Signal_NONE
}
if s.receivedEstimate == s.committedChannelCapacity {
// no change in estimate, no need to commit
return Signal_NONE
}
signal := Signal_ESTIMATE_INCREASE
if s.receivedEstimate < s.committedChannelCapacity {
signal = Signal_ESTIMATE_DECREASE
// remember when the committed capacity last went down
s.lastEstimateDecreaseTime = time.Now()
}
s.committedChannelCapacity = s.receivedEstimate
s.lastCommitTime = time.Now()
s.logger.Debugw("committing channel capacity", "participant", s.participantID, "capacity(bps)", s.committedChannelCapacity)
return signal
}
// getChannelCapacity returns, in that order, the committed channel capacity
// and the latest received estimate, read together under the estimate lock.
func (s *StreamAllocator) getChannelCapacity() (uint64, uint64) {
	s.estimateMu.RLock()
	committed, received := s.committedChannelCapacity, s.receivedEstimate
	s.estimateMu.RUnlock()

	return committed, received
}
// allocate distributes the available channel capacity (the larger of the
// committed and the boosted capacity) over the tracks in sorted order by
// asking each track to adjust its allocation. Once a track cannot get its
// optimal bandwidth, the remaining capacity is set to zero for all tracks
// after it.
//
// Afterwards the state is set to State_DEFICIENT if some track is not
// optimal or the total requested bandwidth exceeds the committed capacity.
// Otherwise, unless the committed capacity is still the initial one, the
// boost is reset and the state is set to State_STABLE. If any track was
// paused or resumed, the registered streamed tracks callback is invoked.
//
// The remaining capacity is tracked as an unsigned value, so it is clamped
// at zero rather than allowed to wrap around should a track report a request
// larger than what was offered to it.
func (s *StreamAllocator) allocate() {
	// LK-TODO-START
	// Introduce some rules for allocate. Something like
	//   - When estimate decreases, immediately.
	//     Maybe have some threshold for decrease case also before triggering.
	//       o 5% decrease OR 200 kbps absolute decrease
	//   - When estimate increases
	//       o 10% increase - conservative in pushing more data
	//       o even if 10% increase, do it only once every 10/15/30 seconds
	// When estimate goes up/down, there could be multiple updates. The challenge
	// is to react quickly, but not too quickly.
	//
	// Some of the challenges here are
	//   - Audio packet loss in subscriber PC should be considered.
	//     If audio loss is too high, throttle video aggressively as
	//     audio quality trumps anything else in a conferencing application.
	//     Note that bandwidth estimation algorithms themselves
	//     might adjust for it and report estimated capacity.
	//   - Video packet loss should be taken into consideration too.
	//   - Especially tricky is video start/stop (either track start/stop
	//     or Simulcast spatial layer switching (temporal layer switching
	//     is fine)). That requires a key frame which is usually 10X
	//     the size of a P-frame. So when channel capacity goes down
	//     switching to a lower spatial layer could cause a temporary
	//     spike in bitrate exacerbating the already congested channel
	//     condition. This is a reason to use a Pacer in the path to
	//     smooth out spikes. But, a Pacer introduces significant
	//     overhead both in terms of memory (outgoing packets need to
	//     be queued) and CPU (a high frequency polling thread to drain
	//     the queued packets on to the wire at predictable rate)
	//   - Video retransmission rate should be taken into account while
	//     allocating to check which layer of publisher will
	//     fit in available bandwidth.
	//   - Increasing channel capacity is a tricky one. Some times,
	//     the bandwidth estimators will not report an increase unless
	//     the channel is probed with more traffic. So, may have to
	//     trigger an allocation if the channel is stable for a while
	//     and send some extra streams. Another option is to just
	//     send RTP padding only packets to probe the channel which
	//     can be done on an existing stream without re-enabling a
	//     stream.
	//   - There is also the issue of time synchronization. This makes
	//     debugging/simulating scenarios difficult. Basically, there
	//     are various kinds of delays in the system. So, when something
	//     really happened and when we are really responding is always
	//     going to be offset. So, need to be cognizant of that and
	//     apply necessary corrections whenever possible. For example
	//       o Bandwidth estimation itself takes time
	//       o RTCP packets could be lost
	//       o RTCP Receiver Report loss could have happened a while back.
	//         As RTCP RR usually reported once a second or so, if there
	//         is loss, there is no indication if that loss happened at
	//         the beginning of the window or not.
	//       o When layer switching, there are more round trips needed.
	//         A PLI has to go to the publisher and publisher has to
	//         generate a key frame. Another very important event
	//         (generation of a key frame) happens potentially 100s of ms
	//         after we asked for it.
	//     In general, just need to be aware of these and tweak allocation
	//     to not cause oscillations.
	// LK-TODO-END
	//
	// LK-TODO-START
	// Calculate the aggregate loss. This may or may not
	// be necessary depending on the algorithm we choose. In this
	// pass, we could also calculate audio & video track loss
	// separately and use different rules.
	//
	// The loss calculation should be for the window between last
	// allocation and now. The `lastPackets*` field in
	// `Track` structure is used to cache the packet stats
	// at the last allocation. Potentially need to think about
	// giving higher weight to recent losses. So, might have
	// to update the `lastPackets*` periodically even when
	// there is no allocation for a long time to ensure loss calculation
	// remains fresh.
	// LK-TODO-END
	//
	s.tracksMu.RLock()

	//
	// Ask down tracks adjust their forwarded layers.
	// It is possible that tracks might all fit under boosted bandwidth scenarios.
	// So, track total requested bandwidth and mark DEFICIENT state if the total is
	// above the estimated channel capacity even if the optimal signal is true.
	//
	// LK-TODO make protocol friendly structures
	var pausedTracks map[string][]string
	var resumedTracks map[string][]string

	isOptimal := true
	totalBandwidthRequested := uint64(0)
	committedChannelCapacity, _ := s.getChannelCapacity()

	// use the boosted capacity if it is higher than the committed one
	availableChannelCapacity := committedChannelCapacity
	if availableChannelCapacity < s.boostedChannelCapacity {
		availableChannelCapacity = s.boostedChannelCapacity
	}

	for _, track := range s.tracksSorted {
		//
		// `audio` tracks will do nothing in this method.
		//
		// `video` tracks could do one of the following
		//    - no change, i. e. currently forwarding optimal available
		//      layer and there is enough bandwidth for that.
		//    - adjust layers up or down
		//    - mute if there is not enough capacity for any layer
		// NOTE: When video switches layers, on layer switch up,
		// the current layer can keep forwarding to ensure smooth
		// video at the client. As layer up usually means there is
		// enough bandwidth, the lower layer can keep streaming till
		// the switch point for higher layer becomes available.
		// But, in the other direction, higher layer forwarding should
		// be stopped immediately to not further congest the channel.
		//
		isPausing, isResuming, bandwidthRequested, optimalBandwidthNeeded := track.AdjustAllocation(availableChannelCapacity)
		totalBandwidthRequested += bandwidthRequested

		switch {
		case optimalBandwidthNeeded > 0 && bandwidthRequested < optimalBandwidthNeeded:
			//
			// Assuming this is a prioritized list of tracks
			// and we are walking down in that priority order.
			// Once one of those streams do not fit, set
			// the availableChannelCapacity to 0 so that no
			// other lower priority stream gets forwarded.
			// Note that a lower priority stream may have
			// a layer which might fit in the left over
			// capacity. This is one type of policy
			// implementation. There may be other policies
			// which might allow lower priority to go through too.
			// So, we need some sort of policy framework here
			// to decide which streams get priority
			//
			availableChannelCapacity = 0
			isOptimal = false

		case bandwidthRequested >= availableChannelCapacity:
			// nothing left (or the track asked for more than was offered):
			// clamp instead of letting the unsigned subtraction wrap around
			// to a huge capacity for the tracks that follow
			availableChannelCapacity = 0

		default:
			availableChannelCapacity -= bandwidthRequested
		}

		if isPausing {
			appendTrack(track, &pausedTracks)
		}
		if isResuming {
			appendTrack(track, &resumedTracks)
		}
	}
	s.tracksMu.RUnlock()

	if !isOptimal || totalBandwidthRequested > committedChannelCapacity {
		s.setState(State_DEFICIENT)
	} else if committedChannelCapacity != InitialChannelCapacity {
		// an estimate has been committed and everything fits in it
		s.resetBoost()
		s.setState(State_STABLE)
	}

	if len(pausedTracks) != 0 || len(resumedTracks) != 0 {
		s.logger.Debugw("streamed tracks changed", "participant", s.participantID, "paused", pausedTracks, "resumed", resumedTracks)
		if s.onStreamedTracksChange != nil {
			err := s.onStreamedTracksChange(pausedTracks, resumedTracks)
			if err != nil {
				s.logger.Errorw("could not send streamed tracks update", err, "participant", s.participantID)
			}
		}
	}

	//
	// The above loop may become a concern. In a typical conference
	// kind of scenario, there are probably not that many people, so
	// the number of down tracks will be limited.
	//
	// But, can imagine a case of roomless having a single peer
	// connection between RTC node and a relay where all the streams
	// (even spanning multiple rooms) are on a single peer connection.
	// In that case, I think this should mostly be disabled, i. e.
	// that peer connection should be looked at as RTC node's publisher
	// peer connection and any throttling mechanisms should be disabled.
	//
}
// getExpectedBandwidthUsage returns the sum of BandwidthRequested() over
// every track in s.tracks. The tracks read lock is held for the duration.
func (s *StreamAllocator) getExpectedBandwidthUsage() uint64 {
	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	var total uint64
	for _, t := range s.tracks {
		total += t.BandwidthRequested()
	}

	return total
}
// getOptimalBandwidthUsage returns the sum of BandwidthOptimal() over
// every track in s.tracks. The tracks read lock is held for the duration.
func (s *StreamAllocator) getOptimalBandwidthUsage() uint64 {
	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	var total uint64
	for _, t := range s.tracks {
		total += t.BandwidthOptimal()
	}

	return total
}
// calculateLoss returns a weighted loss percentage across all tracks:
// AudioLossWeight times the audio loss percentage plus VideoLossWeight
// times the video loss percentage. Each percentage is lost packets over
// total packets, summed per kind from GetPacketStats, and is 0 when no
// packets of that kind were counted.
//
// LK-TODO: unused till loss based estimation is done, but just a sample impl of weighting audio higher
func (s *StreamAllocator) calculateLoss() float32 {
	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	var (
		audioTotal, audioLost uint32
		videoTotal, videoLost uint32
	)
	for _, t := range s.tracks {
		kind, total, lost := t.GetPacketStats()
		switch kind {
		case webrtc.RTPCodecTypeAudio:
			audioTotal += total
			audioLost += lost
		case webrtc.RTPCodecTypeVideo:
			videoTotal += total
			videoLost += lost
		}
	}

	// pct converts a lost/total pair into a percentage, guarding against
	// division by zero when nothing was counted.
	pct := func(lost, total uint32) float32 {
		if total == 0 {
			return 0.0
		}
		return (float32(lost) * 100.0) / float32(total)
	}

	audioLossPct := pct(audioLost, audioTotal)
	videoLossPct := pct(videoLost, videoTotal)

	return AudioLossWeight*audioLossPct + VideoLossWeight*videoLossPct
}
// maybeProbe runs the boost routine selected by s.boostMode, but only
// when isTimeToBoost reports that a boost is due. Boost modes other than
// BoostMode_LAYER and BoostMode_BANDWIDTH do nothing.
func (s *StreamAllocator) maybeProbe() {
	if !s.isTimeToBoost() {
		return
	}

	if mode := s.boostMode; mode == BoostMode_LAYER {
		s.maybeBoostLayer()
	} else if mode == BoostMode_BANDWIDTH {
		s.maybeBoostBandwidth()
	}
}
// maybeBoostLayer asks tracks, one at a time, to increase their
// allocation and stops at the first track that reports a boost. The
// additional bandwidth of that track is added on top of the larger of
// the current boosted capacity and the committed capacity, and the
// boost time is stamped. If no track boosts, nothing changes.
func (s *StreamAllocator) maybeBoostLayer() {
	s.tracksMu.RLock()
	defer s.tracksMu.RUnlock()

	for _, t := range s.tracks {
		boosted, extra := t.IncreaseAllocation()
		if !boosted {
			continue
		}

		// Build on the previous boost point if one is above the committed capacity.
		base := s.committedChannelCapacity
		if s.boostedChannelCapacity > base {
			base = s.boostedChannelCapacity
		}
		s.boostedChannelCapacity = base + extra
		s.lastBoostTime = time.Now()
		return
	}
}
// maybeBoostBandwidth temporarily raises the capacity estimate for probing
// and then re-runs allocation.
//
// The base is the committed channel capacity or the previous boost point,
// whichever is larger. The boost is BoostPct percent of that base, clamped
// to the range [BoostMinBps, BoostMaxBps].
func (s *StreamAllocator) maybeBoostBandwidth() {
	base, _ := s.getChannelCapacity()
	if s.boostedChannelCapacity > base {
		base = s.boostedChannelCapacity
	}

	boost := (base * BoostPct) / 100
	if boost < BoostMinBps {
		boost = BoostMinBps
	}
	if boost > BoostMaxBps {
		boost = BoostMaxBps
	}

	s.lastBoostTime = time.Now()
	s.boostedChannelCapacity = base + boost

	s.allocate()
}
// isTimeToBoost reports whether enough time has passed to artificially
// boost the estimate before allocating.
//
// If a boost time is recorded (lastBoostTime is not the zero time.Time),
// more than BoostWaitMs must have elapsed since it. Checking against the
// last boost prevents multiple artificial boosts when several tracks
// become available in a short span. Otherwise, more than ProbeWaitMs must
// have elapsed since the last estimate decrease.
func (s *StreamAllocator) isTimeToBoost() bool {
	if s.lastBoostTime.IsZero() {
		return time.Since(s.lastEstimateDecreaseTime) > ProbeWaitMs
	}

	return time.Since(s.lastBoostTime) > BoostWaitMs
}
// resetBoost clears any boost state: the last boost time becomes unset
// and the boosted channel capacity is dropped to 0.
func (s *StreamAllocator) resetBoost() {
	// Use the zero time.Time rather than time.Unix(0, 0). isTimeToBoost
	// detects "no boost recorded" via lastBoostTime.IsZero(), which is
	// false for the Unix epoch (IsZero is only true for January 1, year 1).
	// With the epoch value, the wait since the last estimate decrease was
	// never consulted after a reset and the boost check always passed.
	s.lastBoostTime = time.Time{}
	s.boostedChannelCapacity = 0
}
// maybeGratuitousProbe queues a probe cluster on s.prober when the last
// estimate decrease is at least GratuitousProbeWaitMs old and the headroom
// between the last received estimate and the expected usage is at most
// GratuitousProbeHeadroomBps. It returns true if a cluster was added.
func (s *StreamAllocator) maybeGratuitousProbe() bool {
	// LK-TODO: do not probe if there are no video tracks
	if time.Since(s.lastEstimateDecreaseTime) < GratuitousProbeWaitMs {
		return false
	}

	// Base the probe on the last received estimate, not the committed one,
	// as more updates may have arrived since the last commit.
	_, estimate := s.getChannelCapacity()
	expected := s.getExpectedBandwidthUsage()

	// NOTE(review): both operands are uint64, so when expected usage exceeds
	// the estimate this subtraction wraps to a very large value and no probe
	// is sent. Confirm that is the intended behaviour.
	if headroom := estimate - expected; headroom > GratuitousProbeHeadroomBps {
		return false
	}

	// Probe at GratuitousProbePct percent above the estimate, capped at GratuitousProbeMaxBps.
	probe := (estimate * GratuitousProbePct) / 100
	if probe > GratuitousProbeMaxBps {
		probe = GratuitousProbeMaxBps
	}

	s.prober.AddCluster(
		int(estimate+probe),
		int(expected),
		GratuitousProbeMinDurationMs,
		GratuitousProbeMaxDurationMs,
	)

	return true
}
//------------------------------------------------
// appendTrack records t's track ID under its peer ID in the map pointed to
// by m. The map is allocated on first use, so callers may pass a pointer
// to a nil map.
func appendTrack(t *Track, m *map[string][]string) {
	if *m == nil {
		*m = make(map[string][]string)
	}
	byPeer := *m

	key, id := t.PeerID(), t.ID()

	ids, found := byPeer[key]
	if !found {
		ids = make([]string, 0, 2)
	}
	byPeer[key] = append(ids, id)
}
//------------------------------------------------
// Track pairs a DownTrack with the state the stream allocator keeps for it:
// packet stats taken from receiver reports, the results of the most recent
// allocation call, and the max layers used for sorting.
type Track struct {
// LK-TODO-START
// Check if we can do without a lock?
//
// Packet stats are updated in a different thread.
// Maybe a specific lock for that?
// There may be more in the future though.
// LK-TODO-END
// lock is taken by UpdatePacketStats (write) and GetPacketStats (read).
lock sync.RWMutex
// downTrack is the wrapped down track that the methods delegate to.
downTrack *DownTrack
// peerID is the value returned by PeerID().
peerID string
// highestSN is the largest LastSequenceNumber seen in receiver reports.
highestSN uint32
// packetsLost is the largest TotalLost seen in receiver reports.
packetsLost uint32
// lastHighestSN is highestSN as it was before the latest UpdatePacketStats call.
lastHighestSN uint32
// lastPacketsLost is packetsLost as it was before the latest UpdatePacketStats call.
lastPacketsLost uint32
// bandwidthRequested is the value last reported by the down track's
// AdjustAllocation or a successful IncreaseAllocation.
bandwidthRequested uint64
// optimalBandwidthNeeded is the value last reported by the down track's
// AdjustAllocation or a successful IncreaseAllocation.
optimalBandwidthNeeded uint64
// maxSpatialLayer is seeded from the down track in newTrack, replaced by
// UpdateMaxLayers, and is the primary sort key in TrackSorter.
maxSpatialLayer int32
// maxTemporalLayer is seeded from the down track in newTrack, replaced by
// UpdateMaxLayers, and is the secondary sort key in TrackSorter.
maxTemporalLayer int32
}
// newTrack builds a Track for downTrack owned by peerID, seeding the max
// spatial and temporal layers from the down track's current values.
func newTrack(downTrack *DownTrack, peerID string) *Track {
	t := &Track{
		downTrack: downTrack,
		peerID:    peerID,
	}
	t.maxSpatialLayer = downTrack.MaxSpatialLayer()
	t.maxTemporalLayer = downTrack.MaxTemporalLayer()
	return t
}
// DownTrack returns the down track wrapped by this Track.
func (t *Track) DownTrack() *DownTrack {
return t.downTrack
}
// ID returns the ID of the wrapped down track.
func (t *Track) ID() string {
	return t.DownTrack().ID()
}
// PeerID returns the peer ID this Track was created with.
func (t *Track) PeerID() string {
return t.peerID
}
// UpdatePacketStats folds a receiver report into the track's packet stats.
// The current highest sequence number and packets-lost values are saved as
// the "last" values, then each is raised to the largest value found in
// rr.Reports (they never decrease). The write lock is held throughout.
//
// LK-TODO this should probably be maintained in downTrack and this module can query what it needs
func (t *Track) UpdatePacketStats(rr *rtcp.ReceiverReport) {
	t.lock.Lock()
	defer t.lock.Unlock()

	// Snapshot the previous values so GetPacketStats can report deltas.
	t.lastHighestSN, t.lastPacketsLost = t.highestSN, t.packetsLost

	for _, r := range rr.Reports {
		if sn := r.LastSequenceNumber; sn > t.highestSN {
			t.highestSN = sn
		}
		if lost := r.TotalLost; lost > t.packetsLost {
			t.packetsLost = lost
		}
	}
}
// UpdateMaxLayers replaces the stored max spatial and temporal layers.
// These are the values TrackSorter orders tracks by.
func (t *Track) UpdateMaxLayers(maxSpatialLayer, maxTemporalLayer int32) {
	t.maxSpatialLayer, t.maxTemporalLayer = maxSpatialLayer, maxTemporalLayer
}
// GetPacketStats returns the down track's kind along with the change in
// highest sequence number and in packets lost between the two most recent
// stats snapshots. The read lock is held while the values are read.
func (t *Track) GetPacketStats() (webrtc.RTPCodecType, uint32, uint32) {
	t.lock.RLock()
	defer t.lock.RUnlock()

	kind := t.downTrack.Kind()
	packets := t.highestSN - t.lastHighestSN
	lost := t.packetsLost - t.lastPacketsLost

	return kind, packets, lost
}
// WritePaddingRTP forwards the request to the wrapped down track's
// WritePaddingRTP and returns its result unchanged.
func (t *Track) WritePaddingRTP(bytesToSend int) int {
	return t.DownTrack().WritePaddingRTP(bytesToSend)
}
// AdjustAllocation asks the wrapped down track to adjust its allocation
// for the given available channel capacity. It caches the reported
// requested and optimal bandwidth on the Track, then returns the down
// track's four results unchanged: pausing, resuming, bandwidth requested
// and optimal bandwidth needed.
func (t *Track) AdjustAllocation(availableChannelCapacity uint64) (bool, bool, uint64, uint64) {
	pausing, resuming, requested, optimal := t.downTrack.AdjustAllocation(availableChannelCapacity)

	t.bandwidthRequested, t.optimalBandwidthNeeded = requested, optimal

	return pausing, resuming, requested, optimal
}
// IncreaseAllocation asks the wrapped down track to step its allocation up.
//
// It returns whether the allocation was increased and how much more
// bandwidth the new allocation requests than the previously cached one.
// The difference is floored at 0. When nothing was increased, the cached
// values are left untouched and (false, 0) is returned.
func (t *Track) IncreaseAllocation() (bool, uint64) {
	increased, bandwidthRequested, optimalBandwidthNeeded := t.downTrack.IncreaseAllocation()
	if !increased {
		return false, 0
	}

	// Compute the delta in uint64. Converting through int truncates on
	// 32-bit platforms and for values above MaxInt, so compare first and
	// subtract only when the result cannot go negative.
	var additionalBandwidth uint64
	if bandwidthRequested > t.bandwidthRequested {
		additionalBandwidth = bandwidthRequested - t.bandwidthRequested
	}

	t.bandwidthRequested = bandwidthRequested
	t.optimalBandwidthNeeded = optimalBandwidthNeeded

	return true, additionalBandwidth
}
// BandwidthRequested returns the requested bandwidth cached by the most
// recent AdjustAllocation or successful IncreaseAllocation call.
func (t *Track) BandwidthRequested() uint64 {
return t.bandwidthRequested
}
// BandwidthOptimal returns the optimal bandwidth cached by the most
// recent AdjustAllocation or successful IncreaseAllocation call.
func (t *Track) BandwidthOptimal() uint64 {
return t.optimalBandwidthNeeded
}
//------------------------------------------------
// TrackSorter orders tracks by priority for use with the sort package:
// higher max spatial layer first, then higher max temporal layer, then
// ascending track ID.
type TrackSorter []*Track

// Len returns the number of tracks.
func (t TrackSorter) Len() int {
	return len(t)
}

// Swap exchanges the tracks at positions i and j.
func (t TrackSorter) Swap(i, j int) {
	tmp := t[i]
	t[i] = t[j]
	t[j] = tmp
}

// Less reports whether the track at i should sort before the track at j.
func (t TrackSorter) Less(i, j int) bool {
	a, b := t[i], t[j]

	switch {
	case a.maxSpatialLayer != b.maxSpatialLayer:
		// highest spatial layers have higher priority
		return a.maxSpatialLayer > b.maxSpatialLayer
	case a.maxTemporalLayer != b.maxTemporalLayer:
		// highest temporal layers have priority if max spatial layers match
		return a.maxTemporalLayer > b.maxTemporalLayer
	default:
		// use track id to keep ordering if nothing else changes
		// LK-TODO: ideally should be sorting, compare and then re-allocate only if order changed
		return a.ID() < b.ID()
	}
}
//------------------------------------------------