From 3cdae2e27828daf9508144fe76423e45cb796853 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 21 Apr 2026 11:39:39 +0100 Subject: [PATCH] Fix race in new pruning of device lists tables. (#19709) Follows on from #19473. We should be recording where we have deleted up to in the same transaction as we perform the delete, rather than at the end. This code only starts deleting rows after a month (and the original PR isn't in a release yet), so no server should have run into this problem yet. Also let's log more regularly, as the initial set of deletions will likely take a long time. --- changelog.d/19709.misc | 1 + synapse/storage/databases/main/devices.py | 44 ++++++++++++++--------- 2 files changed, 28 insertions(+), 17 deletions(-) create mode 100644 changelog.d/19709.misc diff --git a/changelog.d/19709.misc b/changelog.d/19709.misc new file mode 100644 index 0000000000..596d8a6b26 --- /dev/null +++ b/changelog.d/19709.misc @@ -0,0 +1 @@ +Reduce database disk space usage by pruning old rows from `device_lists_changes_in_room`. diff --git a/synapse/storage/databases/main/devices.py b/synapse/storage/databases/main/devices.py index 339fb8a6f7..8670d68f38 100644 --- a/synapse/storage/databases/main/devices.py +++ b/synapse/storage/databases/main/devices.py @@ -2583,36 +2583,46 @@ class DeviceWorkerStore(RoomMemberWorkerStore, EndToEndKeyWorkerStore): num_deleted += 1 min_stream_id = max(min_stream_id, row[0]) + if num_deleted: + # Update the max pruned stream ID tracking table so that the + # safety check knows data up to this point has been deleted. + self.db_pool.simple_update_one_txn( + txn, + table="device_lists_changes_in_room_max_pruned_stream_id", + keyvalues={}, + updatevalues={"stream_id": min_stream_id}, + ) + return num_deleted - num_rows_deleted = 0 + progress_num_rows_deleted = 0 while True: batch_deleted = await self.db_pool.runInteraction( "prune_device_lists_changes_in_room", prune_device_lists_changes_in_room_txn, ) - num_rows_deleted += batch_deleted - if batch_deleted < PRUNE_DEVICE_LISTS_BATCH_SIZE: + + finished = batch_deleted < PRUNE_DEVICE_LISTS_BATCH_SIZE + + progress_num_rows_deleted += batch_deleted + + # Periodically report progress in the logs. We do this either when + # we've deleted a significant number of rows or when we've finished + # deleting all rows in this round. + if finished or progress_num_rows_deleted > 10000: + logger.info( + "Pruned %d rows from device_lists_changes_in_room", + progress_num_rows_deleted, + ) + progress_num_rows_deleted = 0 + + if finished: break # Sleep for a short time to avoid hammering the database too much if # there are a lot of rows to delete. await self.clock.sleep(Duration(milliseconds=100)) - if num_rows_deleted: - # Update the max pruned stream ID tracking table so that the - # safety check knows data up to this point has been deleted. - await self.db_pool.simple_update_one( - table="device_lists_changes_in_room_max_pruned_stream_id", - keyvalues={}, - updatevalues={"stream_id": prune_before_stream_id}, - desc="prune_device_lists_changes_in_room_update_max_pruned", - ) - - logger.info( - "Pruned %d rows from device_lists_changes_in_room", num_rows_deleted - ) - class DeviceBackgroundUpdateStore(SQLBaseStore): _instance_name: str