From a7ec6e039cdc9ca5a0cc744fbd51511c5ede9f29 Mon Sep 17 00:00:00 2001 From: ilja Date: Sat, 7 Jan 2023 20:52:02 +0100 Subject: [PATCH 1/4] prune_objects can prune orphaned activities We add an option to also prune remote activities who don't have existing objects any more they reference. Rn, we only check for activities who only reference one object, not an array or embeded object. --- lib/mix/tasks/pleroma/database.ex | 40 ++++++- test/mix/tasks/pleroma/database_test.exs | 128 +++++++++++++++++++++++ 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index be59e2271..0f428ca03 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -69,7 +69,8 @@ def run(["prune_objects" | args]) do strict: [ vacuum: :boolean, keep_threads: :boolean, - keep_non_public: :boolean + keep_non_public: :boolean, + prune_orphaned_activities: :boolean ] ) @@ -94,6 +95,21 @@ def run(["prune_objects" | args]) do log_message end + log_message = + if Keyword.get(options, :prune_orphaned_activities) do + log_message <> ", pruning orphaned activities" + else + log_message + end + + log_message = + if Keyword.get(options, :vacuum) do + log_message <> + ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)" + else + log_message + end + Logger.info(log_message) if Keyword.get(options, :keep_threads) do @@ -155,6 +171,28 @@ def run(["prune_objects" | args]) do end |> Repo.delete_all(timeout: :infinity) + if Keyword.get(options, :prune_orphaned_activities) do + """ + delete from public.activities + where id in ( + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + -- Only clean up remote activities + where not a.local + -- For now we only focus on activities with direct 
links to objects + -- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects) + and jsonb_typeof(a."data" -> 'object') = 'string' + -- Find Activities that don't have existing objects + and o.id is null + and a2.id is null + and u.id is null + ) + """ + |> Repo.query() + end + prune_hashtags_query = """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 447a4404e..7f5cd91a9 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -353,6 +353,134 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end + + test "We don't have unexpected tables which can contain objects that are referenced by activities" do + # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. + # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we + # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. + # So when someone adds (or removes) a table, this test will fail. + # Either the table contains objects which can be referenced from the activities table + # => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object. + # Or it doesn't contain objects which can be referenced from the activities table + # => in that case you can add/remove the table to/from this (sorted) list. 
+ + assert Repo.query!( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';" + ).rows + |> Enum.sort() == [ + ["activities"], + ["announcement_read_relationships"], + ["announcements"], + ["apps"], + ["backups"], + ["bookmarks"], + ["chat_message_references"], + ["chats"], + ["config"], + ["conversation_participation_recipient_ships"], + ["conversation_participations"], + ["conversations"], + ["counter_cache"], + ["data_migration_failed_ids"], + ["data_migrations"], + ["deliveries"], + ["filters"], + ["following_relationships"], + ["hashtags"], + ["hashtags_objects"], + ["instances"], + ["lists"], + ["markers"], + ["mfa_tokens"], + ["moderation_log"], + ["notifications"], + ["oauth_authorizations"], + ["oauth_tokens"], + ["oban_jobs"], + ["oban_peers"], + ["objects"], + ["password_reset_tokens"], + ["push_subscriptions"], + ["registrations"], + ["report_notes"], + ["scheduled_activities"], + ["schema_migrations"], + ["thread_mutes"], + ["user_follows_hashtag"], + ["user_frontend_setting_profiles"], + ["user_invite_tokens"], + ["user_notes"], + ["user_relationships"], + ["users"] + ] + end + + test "it prunes orphaned activities with the --prune-orphaned-activities" do + # Add a remote activity which references an Object + %Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"} + }) + |> Repo.insert() + + # Add a remote activity which references an activity + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_with_activity", + "object" => "remote_activity_with_object" + } + }) + |> Repo.insert() + + # Add a remote activity which references an Actor + %User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_actor", "object" => 
"actor"} + }) + |> Repo.insert() + + # Add a remote activity without existing referenced object, activity or actor + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => "non_existing" + } + }) + |> Repo.insert() + + # Add a local activity without existing referenced object, activity or actor + %Activity{} + |> Map.merge(%{ + local: true, + data: %{"id" => "local_activity_with_actor", "object" => "non_existing"} + }) + |> Repo.insert() + + # The remote activities without existing reference, and only the remote activities without existing reference, are deleted + # if, and only if, we provide the --prune-orphaned-activities option + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 4 + end end describe "running update_users_following_followers_counts" do From 57eef6d76492e772f83acba2402d50ecb6a69f6b Mon Sep 17 00:00:00 2001 From: ilja Date: Sun, 8 Jan 2023 18:22:53 +0100 Subject: [PATCH 2/4] prune_objects can prune orphaned activities who reference an array of objects E.g. 
Flag activities have an array of objects We prune the activity when NONE of the objects can be found Note that the cost of finding and deleting these is ~4x higher than finding and deleting the non-array ones Only string: Delete on activities (cost=506573.48..506580.38 rows=0 width=0) Only Array: Delete on activities (cost=3570359.68..4276365.34 rows=0 width=0) (They are still executed separately, so the total cost is the sum of the two) --- lib/mix/tasks/pleroma/database.ex | 47 ++++++++++------- test/mix/tasks/pleroma/database_test.exs | 65 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 0f428ca03..726a22d41 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -172,35 +172,48 @@ def run(["prune_objects" | args]) do |> Repo.delete_all(timeout: :infinity) if Keyword.get(options, :prune_orphaned_activities) do + # Prune activities who link to a single object """ delete from public.activities where id in ( - select a.id from public.activities a - left join public.objects o on a.data ->> 'object' = o.data ->> 'id' - left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' - left join public.users u on a.data ->> 'object' = u.ap_id - -- Only clean up remote activities - where not a.local - -- For now we only focus on activities with direct links to objects - -- e.g. 
not json objects (in case of embedded objects) or json arrays (in case of multiple objects) - and jsonb_typeof(a."data" -> 'object') = 'string' - -- Find Activities that don't have existing objects - and o.id is null - and a2.id is null - and u.id is null + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + where not a.local + and jsonb_typeof(a."data" -> 'object') = 'string' + and o.id is null + and a2.id is null + and u.id is null ) """ - |> Repo.query() + |> Repo.query([], timeout: :infinity) + + # Prune activities who link to an array of objects + """ + delete from public.activities + where id in ( + select a.id from public.activities a + join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + left join public.objects o on j.value = o.data ->> 'id' + left join public.activities a2 on j.value = a2.data ->> 'id' + left join public.users u on j.value = u.ap_id + group by a.id + having max(o.data ->> 'id') is null + and max(a2.data ->> 'id') is null + and max(u.ap_id) is null + ) + """ + |> Repo.query([], timeout: :infinity) end - prune_hashtags_query = """ + """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( SELECT 1 FROM hashtags_objects hto WHERE ht.id = hto.hashtag_id) """ - - Repo.query(prune_hashtags_query) + |> Repo.query() if Keyword.get(options, :vacuum) do Maintenance.vacuum("full") diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 7f5cd91a9..402856f3d 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -354,7 +354,7 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end - test "We don't have unexpected tables which can contain objects 
that are referenced by activities" do + test "We don't have unexpected tables which may contain objects that are referenced by activities" do # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. @@ -481,6 +481,69 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do assert length(activities) == 4 end + + test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do + %Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert() + %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert() + + # Multiple objects, one object exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_object", + "object" => ["non_ existing_object", "existing_object"] + } + }) + |> Repo.insert() + + # Multiple objects, one actor exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_actor", + "object" => ["non_ existing_object", "existing_actor"] + } + }) + |> Repo.insert() + + # Multiple objects, one activity exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_activity", + "object" => ["non_ existing_object", "remote_activity_existing_actor"] + } + }) + |> Repo.insert() + + # Multiple objects none exist (prune) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => ["owo", "whats_this"] + } + }) + |> Repo.insert() + + assert length(Repo.all(Activity)) == 4 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + 
assert length(Repo.all(Activity)) == 4 + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + assert length(activities) == 3 + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 3 + end end describe "running update_users_following_followers_counts" do From c1c962e1a80873a22cdc328fffb446f550892b10 Mon Sep 17 00:00:00 2001 From: ilja Date: Mon, 23 Jan 2023 09:07:44 +0100 Subject: [PATCH 3/4] Add docs for pleroma_ctl database prune_objects --prune-orphaned-activities I also added extra info on VACUUM FULL --- docs/docs/administration/CLI_tasks/database.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index 915139cf7..3d7424d1c 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -21,7 +21,6 @@ Replaces embedded objects with references to them in the `objects` table. Only n mix pleroma.database remove_embedded_objects [option ...] ``` - ### Options - `--vacuum` - run `VACUUM FULL` after the embedded objects are replaced with their references @@ -29,8 +28,11 @@ Replaces embedded objects with references to them in the `objects` table. Only n This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases. +!!! note + The disk space will only be reclaimed after a proper vacuum. By default Postgresql does this for you on a regular basis, but if your instance has been running for a long time and there are many rows deleted, it may be advantageous to use `VACUUM FULL` (e.g. by using the `--vacuum` option). + !!! 
danger - The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free. + You may run out of disk space during the execution of the task or vacuuming if you don't have about one third of the database size free. Vacuum causes a substantial increase in I/O traffic, and may lead to a degraded experience while it is running. === "OTP" @@ -46,9 +48,10 @@ This will prune remote posts older than 90 days (configurable with [`config :ple ### Options -- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...) -- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote -- `--vacuum` - run `VACUUM FULL` after the objects are pruned +- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity). +- `--keep-non-public` - Keep non-public posts like DMs and followers-only, even if they are remote. +- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... Pruning them can significantly reduce the database size. +- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. ## Create a conversation for all existing DMs @@ -96,6 +99,9 @@ Can be safely re-run ## Vacuum the database +!!! 
note + By default PostgreSQL has an autovacuum daemon running. While the tasks described here can help in some cases, they shouldn't be needed on a regular basis. See [the PostgreSQL docs on vacuuming](https://www.postgresql.org/docs/current/sql-vacuum.html) for more information on this. + ### Analyze Running an `analyze` vacuum job can improve performance by updating statistics used by the query planner. **It is safe to cancel this.** From 328b4d93b750f181966b08bff048f0912e2ad6b3 Mon Sep 17 00:00:00 2001 From: ilja Date: Mon, 23 Jan 2023 09:43:16 +0100 Subject: [PATCH 4/4] Changelog + remove some unneeded comments from the tests --- CHANGELOG.md | 4 +++- test/mix/tasks/pleroma/database_test.exs | 11 ----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1da10fae4..0295b1860 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## Unreleased ## Fixed - - Allowed contentMap to be updated on edit +### Added +- Extend the mix task `prune_objects` with option `--prune-orphaned-activities` to also prune orphaned activities, allowing you to reclaim even more database space + ## 2023.02 ### Added diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 402856f3d..9edb2c115 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -416,7 +416,6 @@ test "We don't have unexpected tables which may contain objects that are referen end test "it prunes orphaned activities with the --prune-orphaned-activities" do - # Add a remote activity which references an Object %Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert() %Activity{} @@ -426,7 +425,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do }) |> Repo.insert() - # Add a remote activity which references an activity %Activity{} |> 
Map.merge(%{ local: false, @@ -437,7 +435,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do }) |> Repo.insert() - # Add a remote activity which references an Actor %User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert() %Activity{} @@ -447,7 +444,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do }) |> Repo.insert() - # Add a remote activity without existing referenced object, activity or actor %Activity{} |> Map.merge(%{ local: false, @@ -458,7 +454,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do }) |> Repo.insert() - # Add a local activity without existing referenced object, activity or actor %Activity{} |> Map.merge(%{ local: true, @@ -466,8 +461,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do }) |> Repo.insert() - # The remote activities without existing reference, and only the remote activities without existing reference, are deleted - # if, and only if, we provide the --prune-orphaned-activities option assert length(Repo.all(Activity)) == 5 Mix.Tasks.Pleroma.Database.run(["prune_objects"]) assert length(Repo.all(Activity)) == 5 @@ -486,7 +479,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities when th %Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert() %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert() - # Multiple objects, one object exists (keep) %Activity{} |> Map.merge(%{ local: false, @@ -497,7 +489,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities when th }) |> Repo.insert() - # Multiple objects, one actor exists (keep) %Activity{} |> Map.merge(%{ local: false, @@ -508,7 +499,6 @@ test "it prunes orphaned activities with the --prune-orphaned-activities when th }) |> Repo.insert() - # Multiple objects, one activity exists (keep) %Activity{} |> Map.merge(%{ local: false, @@ -519,7 +509,6 @@ test "it prunes orphaned activities with 
the --prune-orphaned-activities when th }) |> Repo.insert() - # Multiple objects none exist (prune) %Activity{} |> Map.merge(%{ local: false,