diff --git a/CHANGELOG.md b/CHANGELOG.md index a611b3c06..ef288366e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed - Restoring the database from a dump now goes much faster without need for work-arounds +### Added +- Extend the mix task `prune_objects` with option `--prune-orphaned-activities` to also prune orphaned activities, allowing you to reclaim even more database space + ## 2023.02 ### Added diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index 915139cf7..3d7424d1c 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -21,7 +21,6 @@ Replaces embedded objects with references to them in the `objects` table. Only n mix pleroma.database remove_embedded_objects [option ...] ``` - ### Options - `--vacuum` - run `VACUUM FULL` after the embedded objects are replaced with their references @@ -29,8 +28,11 @@ Replaces embedded objects with references to them in the `objects` table. Only n This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases. +!!! note + The disk space will only be reclaimed after a proper vacuum. By default Postgresql does this for you on a regular basis, but if your instance has been running for a long time and there are many rows deleted, it may be advantageous to use `VACUUM FULL` (e.g. by using the `--vacuum` option). + !!! danger - The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free. 
+ You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rd of the database size free. Vacuum causes a substantial increase in I/O traffic, and may lead to a degraded experience while it is running. === "OTP" @@ -46,9 +48,10 @@ This will prune remote posts older than 90 days (configurable with [`config :ple ### Options -- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...) -- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote -- `--vacuum` - run `VACUUM FULL` after the objects are pruned +- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity). +- `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote. +- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... Pruning them can significantly help reduce the database size. +- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. ## Create a conversation for all existing DMs @@ -96,6 +99,9 @@ Can be safely re-run ## Vacuum the database +!!! note + By default Postgresql has an autovacuum daemon running. While the tasks described here can help in some cases, they shouldn't be needed on a regular basis. 
See [the Postgresql docs on vacuuming](https://www.postgresql.org/docs/current/sql-vacuum.html) for more information on this. + ### Analyze Running an `analyze` vacuum job can improve performance by updating statistics used by the query planner. **It is safe to cancel this.** diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index be59e2271..726a22d41 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -69,7 +69,8 @@ def run(["prune_objects" | args]) do strict: [ vacuum: :boolean, keep_threads: :boolean, - keep_non_public: :boolean + keep_non_public: :boolean, + prune_orphaned_activities: :boolean ] ) @@ -94,6 +95,21 @@ def run(["prune_objects" | args]) do log_message end + log_message = + if Keyword.get(options, :prune_orphaned_activities) do + log_message <> ", pruning orphaned activities" + else + log_message + end + + log_message = + if Keyword.get(options, :vacuum) do + log_message <> + ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)" + else + log_message + end + Logger.info(log_message) if Keyword.get(options, :keep_threads) do @@ -155,14 +171,49 @@ def run(["prune_objects" | args]) do end |> Repo.delete_all(timeout: :infinity) - prune_hashtags_query = """ + if Keyword.get(options, :prune_orphaned_activities) do + # Prune activities who link to a single object + """ + delete from public.activities + where id in ( + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + where not a.local + and jsonb_typeof(a."data" -> 'object') = 'string' + and o.id is null + and a2.id is null + and u.id is null + ) + """ + |> Repo.query([], timeout: :infinity) + + # Prune activities who link to an array of objects + """ + delete from public.activities + where id in ( + select a.id from 
public.activities a + join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + left join public.objects o on j.value = o.data ->> 'id' + left join public.activities a2 on j.value = a2.data ->> 'id' + left join public.users u on j.value = u.ap_id + group by a.id + having max(o.data ->> 'id') is null + and max(a2.data ->> 'id') is null + and max(u.ap_id) is null + ) + """ + |> Repo.query([], timeout: :infinity) + end + + """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( SELECT 1 FROM hashtags_objects hto WHERE ht.id = hto.hashtag_id) """ - - Repo.query(prune_hashtags_query) + |> Repo.query() if Keyword.get(options, :vacuum) do Maintenance.vacuum("full") diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 447a4404e..9edb2c115 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -353,6 +353,186 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end + + test "We don't have unexpected tables which may contain objects that are referenced by activities" do + # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. + # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we + # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. + # So when someone adds (or removes) a table, this test will fail. + # Either the table contains objects which can be referenced from the activities table + # => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object. 
+ # Or it doesn't contain objects which can be referenced from the activities table + # => in that case you can add/remove the table to/from this (sorted) list. + + assert Repo.query!( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';" + ).rows + |> Enum.sort() == [ + ["activities"], + ["announcement_read_relationships"], + ["announcements"], + ["apps"], + ["backups"], + ["bookmarks"], + ["chat_message_references"], + ["chats"], + ["config"], + ["conversation_participation_recipient_ships"], + ["conversation_participations"], + ["conversations"], + ["counter_cache"], + ["data_migration_failed_ids"], + ["data_migrations"], + ["deliveries"], + ["filters"], + ["following_relationships"], + ["hashtags"], + ["hashtags_objects"], + ["instances"], + ["lists"], + ["markers"], + ["mfa_tokens"], + ["moderation_log"], + ["notifications"], + ["oauth_authorizations"], + ["oauth_tokens"], + ["oban_jobs"], + ["oban_peers"], + ["objects"], + ["password_reset_tokens"], + ["push_subscriptions"], + ["registrations"], + ["report_notes"], + ["scheduled_activities"], + ["schema_migrations"], + ["thread_mutes"], + ["user_follows_hashtag"], + ["user_frontend_setting_profiles"], + ["user_invite_tokens"], + ["user_notes"], + ["user_relationships"], + ["users"] + ] + end + + test "it prunes orphaned activities with the --prune-orphaned-activities" do + %Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"} + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_with_activity", + "object" => "remote_activity_with_object" + } + }) + |> Repo.insert() + + %User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_actor", "object" => 
"actor"} + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => "non_existing" + } + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: true, + data: %{"id" => "local_activity_with_actor", "object" => "non_existing"} + }) + |> Repo.insert() + + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 4 + end + + test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do + %Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert() + %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_object", + "object" => ["non_ existing_object", "existing_object"] + } + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_actor", + "object" => ["non_ existing_object", "existing_actor"] + } + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_activity", + "object" => ["non_ existing_object", "remote_activity_existing_actor"] + } + }) + |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => ["owo", "whats_this"] + } + }) + |> Repo.insert() + + assert length(Repo.all(Activity)) == 4 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Activity)) == 4 + 
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + assert length(activities) == 3 + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 3 + end end describe "running update_users_following_followers_counts" do