From b0fc0b7a612a42e6f15b87dee2a1db4c383645fb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 31 Mar 2026 07:14:27 -0500 Subject: [PATCH] Lower the database `statement_timeout` to 10m (#19604) Lower the database `statement_timeout` to 10m Part of https://github.com/element-hq/backend-internal/issues/223 The `statement_timeout` was first introduced in https://github.com/matrix-org/synapse/pull/15853 as a great sanity check starting point (any timeout is better than no timeout). This idea stems from [discussions on 2026-03-16](https://docs.google.com/document/d/12RZKPk3a4__JUSH9wYHODo9rRyKzsHg6BSCAcmqmbOU/edit?tab=t.0#bookmark=id.x9y4kx82lvaj) about `matrix.org` performance and, specifically, room event search, which frequently causes long-running transactions that hold up vacuums and cause things to go wibbly. It was brought up again today in another `matrix.org` performance [discussion on 2026-03-25](https://docs.google.com/document/d/15h4901gAzGMAol2n1b71OAcesVMNhgQbPNb55xouluw/edit?tab=t.0#bookmark=id.b53a918ntk0j) and I was surprised that nothing had come of it yet. ### Some things may break When/if things break, we expect to add specific overrides where it matters. For reference, we already have a couple of overrides in the codebase: - When using [`create_index_in_background`](https://github.com/element-hq/synapse/blob/40d35a95e2ce56982f839f2d5f01bdad34e65453/synapse/storage/background_updates.py#L802-L804) - When [purging rooms](https://github.com/element-hq/synapse/blob/40d35a95e2ce56982f839f2d5f01bdad34e65453/synapse/storage/databases/main/purge_events.py#L166-L169) ### Going further Ideally, we could go even further. The `statement_timeout` for any database query coming from people's HTTP requests should be 60s or less (ideally even lower; no one wants to wait that long anyway). For now we'll take this iterative step down to 10m. As another point of reference, on `matrix.org` we even time out the whole HTTP request after 180s. 
For reference, we also discussed propagating request cancellation to the database query (cancelling statements when the user goes away) which is a good additional thing we could do. @sandhose looked into this before and it is only a recent libpq / Postgres / psycopg3 feature, not available in psycopg2 (which we currently use). As an example of how weird cancellation can get, `psql` will spawn a new connection to `SELECT pg_cancel_backend();` when you `Ctrl+C`, see https://neon.com/blog/ctrl-c-in-psql-gives-me-the-heebie-jeebies ### Pull Request Checklist * [x] Pull request is based on the develop branch * [x] Pull request includes a [changelog file](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#changelog). The entry should: - Be a short description of your change which makes sense to users. "Fixed a bug that prevented receiving messages from other servers." instead of "Moved X method from `EventStore` to `EventWorkerStore`.". - Use markdown where necessary, mostly for `code blocks`. - End with either a period (.) or an exclamation mark (!). - Start with a capital letter. - Feel free to credit yourself, by adding a sentence "Contributed by @github_username." or "Contributed by [Your Name]." to the end of the entry. * [x] [Code style](https://element-hq.github.io/synapse/latest/code_style.html) is correct (run the [linters](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#run-the-linters)) --- changelog.d/19604.misc | 1 + synapse/storage/engines/postgres.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 changelog.d/19604.misc diff --git a/changelog.d/19604.misc b/changelog.d/19604.misc new file mode 100644 index 0000000000..65081117d6 --- /dev/null +++ b/changelog.d/19604.misc @@ -0,0 +1 @@ +Lower the Postgres database `statement_timeout` to 10m (previously 1h). 
diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 03ecff27f0..7cd50fb8f1 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -31,6 +31,7 @@ from synapse.storage.engines._base import ( IsolationLevel, ) from synapse.storage.types import Cursor +from synapse.util.duration import Duration if TYPE_CHECKING: from synapse.storage.database import LoggingDatabaseConnection @@ -54,14 +55,15 @@ class PostgresEngine( psycopg2.extensions.register_adapter(bytes, _disable_bytes_adapter) self.synchronous_commit: bool = database_config.get("synchronous_commit", True) - # Set the statement timeout to 1 hour by default. - # Any query taking more than 1 hour should probably be considered a bug; + # Set the statement timeout to 10 minutes by default. + # + # Any query taking more than 10 minutes should probably be considered a bug; # most of the time this is a sign that work needs to be split up or that # some degenerate query plan has been created and the client has probably # timed out/walked off anyway. # This is in milliseconds. self.statement_timeout: int | None = database_config.get( - "statement_timeout", 60 * 60 * 1000 + "statement_timeout", Duration(minutes=10).as_millis() ) self._version: int | None = None # unknown as yet