From 3cdae2e27828daf9508144fe76423e45cb796853 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erikj@element.io>
Date: Tue, 21 Apr 2026 11:39:39 +0100
Subject: [PATCH] Fix race in new pruning of device lists tables. (#19709)

Follows on from #19473.

We should record the stream ID we have deleted up to in the same
transaction that performs each batch of deletes, rather than once after
all batches have finished. This code only starts deleting rows after a
month (and the original PR isn't in a release yet), so no server should
have run into this problem yet.

Also log progress more regularly (roughly every 10,000 deleted rows and
once on completion), as the initial set of deletions will likely take a
long time.
---
 changelog.d/19709.misc                    |  1 +
 synapse/storage/databases/main/devices.py | 44 ++++++++++++++---------
 2 files changed, 28 insertions(+), 17 deletions(-)
 create mode 100644 changelog.d/19709.misc

diff --git a/changelog.d/19709.misc b/changelog.d/19709.misc
new file mode 100644
index 0000000000..596d8a6b26
--- /dev/null
+++ b/changelog.d/19709.misc
@@ -0,0 +1 @@
+Reduce database disk space usage by pruning old rows from `device_lists_changes_in_room`.
diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py
index 339fb8a6f7..8670d68f38 100644
--- a/synapse/storage/databases/main/devices.py
+++ b/synapse/storage/databases/main/devices.py
@@ -2583,36 +2583,46 @@ class DeviceWorkerStore(RoomMemberWorkerStore, EndToEndKeyWorkerStore):
                 num_deleted += 1
                 min_stream_id = max(min_stream_id, row[0])
 
+            if num_deleted:
+                # Update the max pruned stream ID tracking table so that the
+                # safety check knows data up to this point has been deleted.
+                self.db_pool.simple_update_one_txn(
+                    txn,
+                    table="device_lists_changes_in_room_max_pruned_stream_id",
+                    keyvalues={},
+                    updatevalues={"stream_id": min_stream_id},
+                )
+
             return num_deleted
 
-        num_rows_deleted = 0
+        progress_num_rows_deleted = 0
         while True:
             batch_deleted = await self.db_pool.runInteraction(
                 "prune_device_lists_changes_in_room",
                 prune_device_lists_changes_in_room_txn,
             )
-            num_rows_deleted += batch_deleted
-            if batch_deleted < PRUNE_DEVICE_LISTS_BATCH_SIZE:
+
+            finished = batch_deleted < PRUNE_DEVICE_LISTS_BATCH_SIZE
+
+            progress_num_rows_deleted += batch_deleted
+
+            # Periodically report progress in the logs. We do this either when
+            # we've deleted a significant number of rows or when we've finished
+            # deleting all rows in this round.
+            if finished or progress_num_rows_deleted > 10000:
+                logger.info(
+                    "Pruned %d rows from device_lists_changes_in_room",
+                    progress_num_rows_deleted,
+                )
+                progress_num_rows_deleted = 0
+
+            if finished:
                 break
 
             # Sleep for a short time to avoid hammering the database too much if
             # there are a lot of rows to delete.
             await self.clock.sleep(Duration(milliseconds=100))
 
-        if num_rows_deleted:
-            # Update the max pruned stream ID tracking table so that the
-            # safety check knows data up to this point has been deleted.
-            await self.db_pool.simple_update_one(
-                table="device_lists_changes_in_room_max_pruned_stream_id",
-                keyvalues={},
-                updatevalues={"stream_id": prune_before_stream_id},
-                desc="prune_device_lists_changes_in_room_update_max_pruned",
-            )
-
-            logger.info(
-                "Pruned %d rows from device_lists_changes_in_room", num_rows_deleted
-            )
-
 
 class DeviceBackgroundUpdateStore(SQLBaseStore):
     _instance_name: str