Merge branch 'delete_orphaned_activities' into develop
Some checks are pending
ci/woodpecker/push/woodpecker Pipeline is pending
Some checks are pending
ci/woodpecker/push/woodpecker Pipeline is pending
This commit is contained in:
commit
f56e3098ef
4 changed files with 249 additions and 9 deletions
|
@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
### Changed
|
||||
- Restoring the database from a dump now goes much faster without need for work-arounds
|
||||
|
||||
### Added
|
||||
- Extend the mix task `prune_objects` with the option `--prune-orphaned-activities` to also prune orphaned activities, allowing you to reclaim even more database space
|
||||
|
||||
## 2023.02
|
||||
|
||||
### Added
|
||||
|
|
|
@ -21,7 +21,6 @@ Replaces embedded objects with references to them in the `objects` table. Only n
|
|||
mix pleroma.database remove_embedded_objects [option ...]
|
||||
```
|
||||
|
||||
|
||||
### Options
|
||||
- `--vacuum` - run `VACUUM FULL` after the embedded objects are replaced with their references
|
||||
|
||||
|
@ -29,8 +28,11 @@ Replaces embedded objects with references to them in the `objects` table. Only n
|
|||
|
||||
This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases.
|
||||
|
||||
!!! note
|
||||
The disk space will only be reclaimed after a proper vacuum. By default PostgreSQL does this for you on a regular basis, but if your instance has been running for a long time and many rows have been deleted, it may be advantageous to use `VACUUM FULL` (e.g. by using the `--vacuum` option).
|
||||
|
||||
!!! danger
|
||||
The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free.
|
||||
You may run out of disk space during the execution of the task or vacuuming if you don't have about a third of the database size free. Vacuum causes a substantial increase in I/O traffic, and may lead to a degraded experience while it is running.
|
||||
|
||||
=== "OTP"
|
||||
|
||||
|
@ -46,9 +48,10 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
|
|||
|
||||
### Options
|
||||
|
||||
- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...)
|
||||
- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote
|
||||
- `--vacuum` - run `VACUUM FULL` after the objects are pruned
|
||||
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity).
|
||||
- `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote.
|
||||
- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... Pruning them can significantly help reduce the database size.
|
||||
- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning.
|
||||
|
||||
## Create a conversation for all existing DMs
|
||||
|
||||
|
@ -96,6 +99,9 @@ Can be safely re-run
|
|||
|
||||
## Vacuum the database
|
||||
|
||||
!!! note
|
||||
By default PostgreSQL has an autovacuum daemon running. While the tasks described here can help in some cases, they shouldn't be needed on a regular basis. See [the PostgreSQL docs on vacuuming](https://www.postgresql.org/docs/current/sql-vacuum.html) for more information on this.
|
||||
|
||||
### Analyze
|
||||
|
||||
Running an `analyze` vacuum job can improve performance by updating statistics used by the query planner. **It is safe to cancel this.**
|
||||
|
|
|
@ -69,7 +69,8 @@ def run(["prune_objects" | args]) do
|
|||
strict: [
|
||||
vacuum: :boolean,
|
||||
keep_threads: :boolean,
|
||||
keep_non_public: :boolean
|
||||
keep_non_public: :boolean,
|
||||
prune_orphaned_activities: :boolean
|
||||
]
|
||||
)
|
||||
|
||||
|
@ -94,6 +95,21 @@ def run(["prune_objects" | args]) do
|
|||
log_message
|
||||
end
|
||||
|
||||
log_message =
|
||||
if Keyword.get(options, :prune_orphaned_activities) do
|
||||
log_message <> ", pruning orphaned activities"
|
||||
else
|
||||
log_message
|
||||
end
|
||||
|
||||
log_message =
|
||||
if Keyword.get(options, :vacuum) do
|
||||
log_message <>
|
||||
", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
|
||||
else
|
||||
log_message
|
||||
end
|
||||
|
||||
Logger.info(log_message)
|
||||
|
||||
if Keyword.get(options, :keep_threads) do
|
||||
|
@ -155,14 +171,49 @@ def run(["prune_objects" | args]) do
|
|||
end
|
||||
|> Repo.delete_all(timeout: :infinity)
|
||||
|
||||
prune_hashtags_query = """
|
||||
if Keyword.get(options, :prune_orphaned_activities) do
|
||||
# Prune activities who link to a single object
|
||||
"""
|
||||
delete from public.activities
|
||||
where id in (
|
||||
select a.id from public.activities a
|
||||
left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
|
||||
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
|
||||
left join public.users u on a.data ->> 'object' = u.ap_id
|
||||
where not a.local
|
||||
and jsonb_typeof(a."data" -> 'object') = 'string'
|
||||
and o.id is null
|
||||
and a2.id is null
|
||||
and u.id is null
|
||||
)
|
||||
"""
|
||||
|> Repo.query([], timeout: :infinity)
|
||||
|
||||
# Prune activities who link to an array of objects
|
||||
"""
|
||||
delete from public.activities
|
||||
where id in (
|
||||
select a.id from public.activities a
|
||||
join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
|
||||
left join public.objects o on j.value = o.data ->> 'id'
|
||||
left join public.activities a2 on j.value = a2.data ->> 'id'
|
||||
left join public.users u on j.value = u.ap_id
|
||||
group by a.id
|
||||
having max(o.data ->> 'id') is null
|
||||
and max(a2.data ->> 'id') is null
|
||||
and max(u.ap_id) is null
|
||||
)
|
||||
"""
|
||||
|> Repo.query([], timeout: :infinity)
|
||||
end
|
||||
|
||||
"""
|
||||
DELETE FROM hashtags AS ht
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM hashtags_objects hto
|
||||
WHERE ht.id = hto.hashtag_id)
|
||||
"""
|
||||
|
||||
Repo.query(prune_hashtags_query)
|
||||
|> Repo.query()
|
||||
|
||||
if Keyword.get(options, :vacuum) do
|
||||
Maintenance.vacuum("full")
|
||||
|
|
|
@ -353,6 +353,186 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts"
|
|||
|
||||
assert length(Repo.all(Object)) == 1
|
||||
end
|
||||
|
||||
test "We don't have unexpected tables which may contain objects that are referenced by activities" do
|
||||
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
|
||||
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we
|
||||
# add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities.
|
||||
# So when someone adds (or removes) a table, this test will fail.
|
||||
# Either the table contains objects which can be referenced from the activities table
|
||||
# => in that case the prune_objects job should be adapted so we don't delete activities who still have the referenced object.
|
||||
# Or it doesn't contain objects which can be referenced from the activities table
|
||||
# => in that case you can add/remove the table to/from this (sorted) list.
|
||||
|
||||
assert Repo.query!(
|
||||
"SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';"
|
||||
).rows
|
||||
|> Enum.sort() == [
|
||||
["activities"],
|
||||
["announcement_read_relationships"],
|
||||
["announcements"],
|
||||
["apps"],
|
||||
["backups"],
|
||||
["bookmarks"],
|
||||
["chat_message_references"],
|
||||
["chats"],
|
||||
["config"],
|
||||
["conversation_participation_recipient_ships"],
|
||||
["conversation_participations"],
|
||||
["conversations"],
|
||||
["counter_cache"],
|
||||
["data_migration_failed_ids"],
|
||||
["data_migrations"],
|
||||
["deliveries"],
|
||||
["filters"],
|
||||
["following_relationships"],
|
||||
["hashtags"],
|
||||
["hashtags_objects"],
|
||||
["instances"],
|
||||
["lists"],
|
||||
["markers"],
|
||||
["mfa_tokens"],
|
||||
["moderation_log"],
|
||||
["notifications"],
|
||||
["oauth_authorizations"],
|
||||
["oauth_tokens"],
|
||||
["oban_jobs"],
|
||||
["oban_peers"],
|
||||
["objects"],
|
||||
["password_reset_tokens"],
|
||||
["push_subscriptions"],
|
||||
["registrations"],
|
||||
["report_notes"],
|
||||
["scheduled_activities"],
|
||||
["schema_migrations"],
|
||||
["thread_mutes"],
|
||||
["user_follows_hashtag"],
|
||||
["user_frontend_setting_profiles"],
|
||||
["user_invite_tokens"],
|
||||
["user_notes"],
|
||||
["user_relationships"],
|
||||
["users"]
|
||||
]
|
||||
end
|
||||
|
||||
test "it prunes orphaned activities with the --prune-orphaned-activities" do
|
||||
%Object{} |> Map.merge(%{data: %{"id" => "object_for_activity"}}) |> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{"id" => "remote_activity_with_object", "object" => "object_for_activity"}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_with_activity",
|
||||
"object" => "remote_activity_with_object"
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%User{} |> Map.merge(%{ap_id: "actor"}) |> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{"id" => "remote_activity_with_actor", "object" => "actor"}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_without_existing_referenced_object",
|
||||
"object" => "non_existing"
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: true,
|
||||
data: %{"id" => "local_activity_with_actor", "object" => "non_existing"}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
assert length(Repo.all(Activity)) == 5
|
||||
Mix.Tasks.Pleroma.Database.run(["prune_objects"])
|
||||
assert length(Repo.all(Activity)) == 5
|
||||
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
|
||||
activities = Repo.all(Activity)
|
||||
|
||||
assert "remote_activity_without_existing_referenced_object" not in Enum.map(
|
||||
activities,
|
||||
fn a -> a.data["id"] end
|
||||
)
|
||||
|
||||
assert length(activities) == 4
|
||||
end
|
||||
|
||||
test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do
|
||||
%Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert()
|
||||
%User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_existing_object",
|
||||
"object" => ["non_ existing_object", "existing_object"]
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_existing_actor",
|
||||
"object" => ["non_ existing_object", "existing_actor"]
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_existing_activity",
|
||||
"object" => ["non_ existing_object", "remote_activity_existing_actor"]
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
%Activity{}
|
||||
|> Map.merge(%{
|
||||
local: false,
|
||||
data: %{
|
||||
"id" => "remote_activity_without_existing_referenced_object",
|
||||
"object" => ["owo", "whats_this"]
|
||||
}
|
||||
})
|
||||
|> Repo.insert()
|
||||
|
||||
assert length(Repo.all(Activity)) == 4
|
||||
Mix.Tasks.Pleroma.Database.run(["prune_objects"])
|
||||
assert length(Repo.all(Activity)) == 4
|
||||
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
|
||||
activities = Repo.all(Activity)
|
||||
assert length(activities) == 3
|
||||
|
||||
assert "remote_activity_without_existing_referenced_object" not in Enum.map(
|
||||
activities,
|
||||
fn a -> a.data["id"] end
|
||||
)
|
||||
|
||||
assert length(activities) == 3
|
||||
end
|
||||
end
|
||||
|
||||
describe "running update_users_following_followers_counts" do
|
||||
|
|
Loading…
Reference in a new issue