From 2829a146d32e472fcd2885ed32cb3bbea6f86690 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 7 May 2026 10:36:25 -0500 Subject: [PATCH] Reduce `WORKER_LOCK_MAX_RETRY_INTERVAL` to 5 seconds (#19755) It is better to retry more quickly than to have workers wait around. 5 seconds is still a reasonable gap in time to not overwhelm anything. This matters most in cross-worker scenarios. When locks are on the same worker and the lock holder releases, we signal the other locks (with the same name/key) to try reacquiring the lock immediately. But locks on other workers only re-check based on their retry `_timeout_interval`. This updates the value to 5 seconds to match the apparent previous intention, judging by the [flawed code](https://github.com/element-hq/synapse/blob/6100f6e4f7fb0c72f1ae2802683ebc811c0e3a77/synapse/handlers/worker_lock.py#L278). We can assume the original authors were trying to have 5 seconds as the maximum retry interval. Spawned from https://github.com/element-hq/synapse/pull/19394#discussion_r3168458070 --- changelog.d/19755.misc | 1 + synapse/handlers/worker_lock.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 changelog.d/19755.misc diff --git a/changelog.d/19755.misc b/changelog.d/19755.misc new file mode 100644 index 0000000000..6ad478e531 --- /dev/null +++ b/changelog.d/19755.misc @@ -0,0 +1 @@ +Reduce `WORKER_LOCK_MAX_RETRY_INTERVAL` to 5 seconds to reduce idle time after lock is released. diff --git a/synapse/handlers/worker_lock.py b/synapse/handlers/worker_lock.py index 51be3b5084..57792ea53c 100644 --- a/synapse/handlers/worker_lock.py +++ b/synapse/handlers/worker_lock.py @@ -54,7 +54,18 @@ logger = logging.getLogger(__name__) # will not disappear under our feet as long as we don't delete the room. 
NEW_EVENT_DURING_PURGE_LOCK_NAME = "new_event_during_purge_lock" -WORKER_LOCK_MAX_RETRY_INTERVAL = Duration(seconds=60) +WORKER_LOCK_MAX_RETRY_INTERVAL = Duration(seconds=5) +""" +The maximum wait time before retrying to acquire the lock. + +Better to retry more quickly than have workers wait around. 5 seconds is still a +reasonable gap in time to not overwhelm the CPU/Database. + +This matters most in cross-worker scenarios. When locks are on the same worker, when the +lock holder releases, we signal to other locks (with the same name/key) that they +should try reacquiring the lock immediately. But locks on other workers only re-check +based on their retry `_timeout_interval`. +""" WORKER_LOCK_EXCESSIVE_WAITING_WARN_DURATION = Duration(minutes=10)