From a7ec6e039cdc9ca5a0cc744fbd51511c5ede9f29 Mon Sep 17 00:00:00 2001 From: ilja Date: Sat, 7 Jan 2023 20:52:02 +0100 Subject: [PATCH] prune_objects can prune orphaned activities We add an option to also prune remote activities who don't have existing objects any more they reference. Rn, we only check for activities who only reference one object, not an array or embeded object. --- lib/mix/tasks/pleroma/database.ex | 40 ++++++- test/mix/tasks/pleroma/database_test.exs | 128 +++++++++++++++++++++++ 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index be59e2271..0f428ca03 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -69,7 +69,8 @@ def run(["prune_objects" | args]) do strict: [ vacuum: :boolean, keep_threads: :boolean, - keep_non_public: :boolean + keep_non_public: :boolean, + prune_orphaned_activities: :boolean ] ) @@ -94,6 +95,21 @@ def run(["prune_objects" | args]) do log_message end + log_message = + if Keyword.get(options, :prune_orphaned_activities) do + log_message <> ", pruning orphaned activities" + else + log_message + end + + log_message = + if Keyword.get(options, :vacuum) do + log_message <> + ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)" + else + log_message + end + Logger.info(log_message) if Keyword.get(options, :keep_threads) do @@ -155,6 +171,28 @@ def run(["prune_objects" | args]) do end |> Repo.delete_all(timeout: :infinity) + if Keyword.get(options, :prune_orphaned_activities) do + """ + delete from public.activities + where id in ( + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + -- Only clean up remote activities + where not a.local + -- For now we only focus on activities with direct links to objects + -- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects) + and jsonb_typeof(a."data" -> 'object') = 'string' + -- Find Activities that don't have existing objects + and o.id is null + and a2.id is null + and u.id is null + ) + """ + |> Repo.query() + end + prune_hashtags_query = """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 447a4404e..7f5cd91a9 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -353,6 +353,134 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end + + test "We don't have unexpected tables which can contain objects that are referenced by activities" do + # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. + # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we + # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. + # So when someone adds (or removes) a table, this test will fail. + # Either the table contains objects which can be referenced from the activities table + # => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object. + # Or it doesn't contain objects which can be referenced from the activities table + # => in that case you can add/remove the table to/from this (sorted) list. + + assert Repo.query!( + "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';" + ).rows + |> Enum.sort() == [ + ["activities"], + ["announcement_read_relationships"], + ["announcements"], + ["apps"], + ["backups"], + ["bookmarks"], + ["chat_message_references"], + ["chats"], + ["config"], + ["conversation_participation_recipient_ships"], + ["conversation_participations"], + ["conversations"], + ["counter_cache"], + ["data_migration_failed_ids"], + ["data_migrations"], + ["deliveries"], + ["filters"], + ["following_relationships"], + ["hashtags"], + ["hashtags_objects"], + ["instances"], + ["lists"], + ["markers"], + ["mfa_tokens"], + ["moderation_log"], + ["notifications"], + ["oauth_authorizations"], + ["oauth_tokens"], + ["oban_jobs"], + ["oban_peers"], + ["objects"], + ["password_reset_tokens"], + ["push_subscriptions"], + ["registrations"], + ["report_notes"], + ["scheduled_activities"], + ["schema_migrations"], + ["thread_mutes"], + ["user_follows_hashtag"], + ["user_frontend_setting_profiles"], + ["user_invite_tokens"], + ["user_notes"], + ["user_relationships"], + ["users"] + ] + end + + test "it prunes orphaned activities with the --prune-orphaned-activities" do + # Add a remote activity which references an Object + %Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"} + }) + |> Repo.insert() + + # Add a remote activity which references an activity + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_with_activity", + "object" => "remote_activity_with_object" + } + }) + |> Repo.insert() + + # Add a remote activity which references an Actor + %User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert() + + %Activity{} + |> Map.merge(%{ + local: false, + data: %{"id" => "remote_activity_with_actor", "object" => "actor"} + }) + |> Repo.insert() + + # Add a remote activity without existing referenced object, activity or actor + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => "non_existing" + } + }) + |> Repo.insert() + + # Add a local activity without existing referenced object, activity or actor + %Activity{} + |> Map.merge(%{ + local: true, + data: %{"id" => "local_activity_with_actor", "object" => "non_existing"} + }) + |> Repo.insert() + + # The remote activities without existing reference, and only the remote activities without existing reference, are deleted + # if, and only if, we provide the --prune-orphaned-activities option + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Activity)) == 5 + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 4 + end end describe "running update_users_following_followers_counts" do