prune_objects can prune orphaned activities

We add an option to also prune remote activities who don't have existing objects any more they reference.
Rn, we only check for activities who only reference one object, not an array or embeded object.
This commit is contained in:
ilja 2023-01-07 20:52:02 +01:00
parent 9f34294332
commit a7ec6e039c
2 changed files with 167 additions and 1 deletions

View file

@ -69,7 +69,8 @@ def run(["prune_objects" | args]) do
strict: [ strict: [
vacuum: :boolean, vacuum: :boolean,
keep_threads: :boolean, keep_threads: :boolean,
keep_non_public: :boolean keep_non_public: :boolean,
prune_orphaned_activities: :boolean
] ]
) )
@ -94,6 +95,21 @@ def run(["prune_objects" | args]) do
log_message log_message
end end
log_message =
if Keyword.get(options, :prune_orphaned_activities) do
log_message <> ", pruning orphaned activities"
else
log_message
end
log_message =
if Keyword.get(options, :vacuum) do
log_message <>
", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
else
log_message
end
Logger.info(log_message) Logger.info(log_message)
if Keyword.get(options, :keep_threads) do if Keyword.get(options, :keep_threads) do
@ -155,6 +171,28 @@ def run(["prune_objects" | args]) do
end end
|> Repo.delete_all(timeout: :infinity) |> Repo.delete_all(timeout: :infinity)
if Keyword.get(options, :prune_orphaned_activities) do
"""
delete from public.activities
where id in (
select a.id from public.activities a
left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
left join public.users u on a.data ->> 'object' = u.ap_id
-- Only clean up remote activities
where not a.local
-- For now we only focus on activities with direct links to objects
-- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects)
and jsonb_typeof(a."data" -> 'object') = 'string'
-- Find Activities that don't have existing objects
and o.id is null
and a2.id is null
and u.id is null
)
"""
|> Repo.query()
end
prune_hashtags_query = """ prune_hashtags_query = """
DELETE FROM hashtags AS ht DELETE FROM hashtags AS ht
WHERE NOT EXISTS ( WHERE NOT EXISTS (

View file

@ -353,6 +353,134 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts"
assert length(Repo.all(Object)) == 1 assert length(Repo.all(Object)) == 1
end end
test "We don't have unexpected tables which can contain objects that are referenced by activities" do
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we
# add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities.
# So when someone adds (or removes) a table, this test will fail.
# Either the table contains objects which can be referenced from the activities table
# => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object.
# Or it doesn't contain objects which can be referenced from the activities table
# => in that case you can add/remove the table to/from this (sorted) list.
assert Repo.query!(
"SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';"
).rows
|> Enum.sort() == [
["activities"],
["announcement_read_relationships"],
["announcements"],
["apps"],
["backups"],
["bookmarks"],
["chat_message_references"],
["chats"],
["config"],
["conversation_participation_recipient_ships"],
["conversation_participations"],
["conversations"],
["counter_cache"],
["data_migration_failed_ids"],
["data_migrations"],
["deliveries"],
["filters"],
["following_relationships"],
["hashtags"],
["hashtags_objects"],
["instances"],
["lists"],
["markers"],
["mfa_tokens"],
["moderation_log"],
["notifications"],
["oauth_authorizations"],
["oauth_tokens"],
["oban_jobs"],
["oban_peers"],
["objects"],
["password_reset_tokens"],
["push_subscriptions"],
["registrations"],
["report_notes"],
["scheduled_activities"],
["schema_migrations"],
["thread_mutes"],
["user_follows_hashtag"],
["user_frontend_setting_profiles"],
["user_invite_tokens"],
["user_notes"],
["user_relationships"],
["users"]
]
end
test "it prunes orphaned activities with the --prune-orphaned-activities" do
# Add a remote activity which references an Object
%Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert()
%Activity{}
|> Map.merge(%{
local: false,
data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"}
})
|> Repo.insert()
# Add a remote activity which references an activity
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_with_activity",
"object" => "remote_activity_with_object"
}
})
|> Repo.insert()
# Add a remote activity which references an Actor
%User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert()
%Activity{}
|> Map.merge(%{
local: false,
data: %{"id" => "remote_activity_with_actor", "object" => "actor"}
})
|> Repo.insert()
# Add a remote activity without existing referenced object, activity or actor
%Activity{}
|> Map.merge(%{
local: false,
data: %{
"id" => "remote_activity_without_existing_referenced_object",
"object" => "non_existing"
}
})
|> Repo.insert()
# Add a local activity without existing referenced object, activity or actor
%Activity{}
|> Map.merge(%{
local: true,
data: %{"id" => "local_activity_with_actor", "object" => "non_existing"}
})
|> Repo.insert()
# The remote activities without existing reference, and only the remote activities without existing reference, are deleted
# if, and only if, we provide the --prune-orphaned-activities option
assert length(Repo.all(Activity)) == 5
Mix.Tasks.Pleroma.Database.run(["prune_objects"])
assert length(Repo.all(Activity)) == 5
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
activities = Repo.all(Activity)
assert "remote_activity_without_existing_referenced_object" not in Enum.map(
activities,
fn a -> a.data["id"] end
)
assert length(activities) == 4
end
end end
describe "running update_users_following_followers_counts" do describe "running update_users_following_followers_counts" do