From ff034c1f623bc769f1f3bca801592dd42697ea66 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 10 Jun 2026 17:33:21 +0100 Subject: [PATCH] Fix flakey test failing with "twisted.protocols.amp.TooLong" (#19832) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was all done through claude. ## Fix flaky `test_edu_large_messages_not_splitting_one_user` (`TooLong` under `trial -jN`) ### Problem The "Build .deb packages" CI step intermittently failed with: ``` twisted.protocols.amp.TooLong ...in tests.rest.client.test_sendtodevice.SendToDeviceTestCase.test_edu_large_messages_not_splitting_one_user ``` The deb build runs the suite with `twisted.trial -j2`. In that mode, worker log events are shipped to the manager process over Twisted's AMP protocol, which encodes each value with a 2-byte length prefix — so any single log line of **64 KiB or more** raises `TooLong`. ### Root cause Not a bug in the to-device/EDU logic, and **not** debug logging enabled by the build (it runs at the default `ERROR` level). It's a **log-level leak between tests sharing a `-j2` worker**: - `tests/logging/test_loggers.py::ExplicitlyConfiguredLoggerTestCase` calls `root_logger.setLevel(logging.DEBUG)` directly and never restores it (no `setUp`/`tearDown`/`addCleanup`). - When that test runs before `test_edu_large_messages_not_splitting_one_user` **in the same worker process**, the root logger is left at `DEBUG`. - That test deliberately builds an EDU of exactly `SOFT_MAX_EDU_SIZE - 1` (65 535) bytes. Storing it triggers `synapse/storage/database.py`'s `[SQL values]` DEBUG log, which dumps the full query params — producing a **65 708-byte** line that overflows AMP's cap. It looks "flaky" purely because of `-j2` scheduling: whether the two tests land on the same worker, and in what order. ### Fix Three commits: 1. **Restore the root logger level in `ExplicitlyConfiguredLoggerTestCase`** — `addCleanup` to put the level back. Fixes the root cause. 2. **Truncate oversized log lines in the test log handler** (`ToTwistedHandler.emit`) — caps lines at 1000 chars so no debug line can break `trial -jN`, regardless of which query is logged (defense in depth). 3. **Truncate values in `[SQL values]` debug logging** — caps the logged param repr at 1000 chars (guarded by `isEnabledFor(DEBUG)` to keep the hot path lazy). Keeps production debug logs sane too. ### Testing - The reproduction (`trial -j2 tests.logging.test_loggers `) went from **~3/8 failing** to **10/10 passing**. - Confirmed the root level is restored after the logging tests, and that the `[SQL values]` line is now capped (~50 KB with a `[truncated]` marker, was 65 708). - `ruff` + `mypy` clean; `tests.logging.test_loggers` and `tests.rest.client.test_sendtodevice` pass (14/14). --------- Co-authored-by: Claude Opus 4.8 Co-authored-by: Eric Eastwood --- changelog.d/19832.misc | 1 + synapse/storage/database.py | 15 ++++++++++++++- tests/logging/test_loggers.py | 13 +++++++++++++ tests/test_utils/logging_setup.py | 13 +++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 changelog.d/19832.misc diff --git a/changelog.d/19832.misc b/changelog.d/19832.misc new file mode 100644 index 0000000000..23c63fa10c --- /dev/null +++ b/changelog.d/19832.misc @@ -0,0 +1 @@ +Fix a flaky test (`twisted.protocols.amp.TooLong` error under `trial -jN`) caused by an oversized debug log line. diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 6e38b55686..39c0631d50 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -77,6 +77,13 @@ sql_logger = logging.getLogger("synapse.storage.SQL") transaction_logger = logging.getLogger("synapse.storage.txn") perf_logger = logging.getLogger("synapse.storage.TIME") +# The maximum length of the repr of a query's values that we log at DEBUG. Some +# queries carry large payloads (e.g. to-device messages), and logging them in +# full is not useful. Truncating also keeps the line well under AMP's 64KiB +# per-value limit, which would otherwise break `trial -jN` test runs (c.f. +# https://github.com/twisted/twisted/issues/12482). +MAX_SQL_VALUE_LOG_LENGTH = 1000 + sql_scheduling_timer = Histogram( "synapse_storage_schedule_time", "sec", labelnames=[SERVER_NAME_LABEL] ) @@ -502,7 +509,13 @@ class LoggingTransaction: sql = self.database_engine.convert_param_style(sql) if args: try: - sql_logger.debug("[SQL values] {%s} %r", self.name, args[0]) + if sql_logger.isEnabledFor(logging.DEBUG): + value_repr = repr(args[0]) + if len(value_repr) > MAX_SQL_VALUE_LOG_LENGTH: + value_repr = ( + value_repr[:MAX_SQL_VALUE_LOG_LENGTH] + "... [truncated]" + ) + sql_logger.debug("[SQL values] {%s} %s", self.name, value_repr) except Exception: # Don't let logging failures stop SQL from working pass diff --git a/tests/logging/test_loggers.py b/tests/logging/test_loggers.py index 9a9bf14376..815746b2dd 100644 --- a/tests/logging/test_loggers.py +++ b/tests/logging/test_loggers.py @@ -21,6 +21,19 @@ from tests.unittest import TestCase class ExplicitlyConfiguredLoggerTestCase(TestCase): + def setUp(self) -> None: + super().setUp() + + # These tests mutate the root logger's level (see the various + # `root_logger.setLevel(...)` calls below). Make sure we restore the + # original level afterwards, otherwise we leak a raised log level into + # the other tests running in the same worker process. A raised level can + # cause oversized debug log lines which break `trial -jN` (worker log + # events are shipped to the manager over trials's AMP, which caps each + # value at 64KiB, c.f. https://github.com/twisted/twisted/issues/12482). + root_logger = logging.getLogger() + self.addCleanup(root_logger.setLevel, root_logger.level) + def _create_explicitly_configured_logger(self) -> logging.Logger: original_logger_class = logging.getLoggerClass() logging.setLoggerClass(ExplicitlyConfiguredLogger) diff --git a/tests/test_utils/logging_setup.py b/tests/test_utils/logging_setup.py index d58222a9f6..bfae36436f 100644 --- a/tests/test_utils/logging_setup.py +++ b/tests/test_utils/logging_setup.py @@ -25,6 +25,14 @@ import twisted.logger from synapse.logging.context import LoggingContextFilter from synapse.synapse_rust import reset_logging_config +# The maximum length of a single log line. Extremely long log lines aren't +# useful, and keeping them short also stays well under the limit that would +# otherwise break parallel test runs: under `trial -jN` worker log events are +# shipped to the manager process over trial's AMP, which raises `TooLong` for +# any value of 64KiB or more (e.g. a DEBUG dump of a large SQL query's values) +# (c.f. https://github.com/twisted/twisted/issues/12482). +_MAX_LOG_LINE_LENGTH = 60_000 + class ToTwistedHandler(logging.Handler): """logging handler which sends the logs to the twisted log""" @@ -33,6 +41,11 @@ class ToTwistedHandler(logging.Handler): def emit(self, record: logging.LogRecord) -> None: log_entry = self.format(record) + if len(log_entry) > _MAX_LOG_LINE_LENGTH: + log_entry = ( + log_entry[:_MAX_LOG_LINE_LENGTH] + + "... [truncated overly long log line]" + ) log_level = record.levelname.lower().replace("warning", "warn") self.tx_log.emit( twisted.logger.LogLevel.levelWithName(log_level), "{entry}", entry=log_entry