Add limit CLI flags to prune jobs #655

Open
Oneric wants to merge 5 commits from Oneric/akkoma:prune-batch into develop
4 changed files with 154 additions and 53 deletions

View File

@ -12,6 +12,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Akkoma API is now documented
- ability to auto-approve follow requests from users you are already following
- The SimplePolicy MRF can now strip user backgrounds from selected remote hosts
- New standalone `prune_orphaned_activities` mix task with configurable batch limit
- The `prune_objects` mix task now accepts a `--limit` parameter for initial object pruning
## Changed
- OTP builds are now built on erlang OTP26

View File

@ -50,9 +50,37 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also wont delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity).
- `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote.
- `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task.
- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size.
- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning.
## Prune orphaned activities from the database
This will prune activities which are no longer referenced by anything.
Such activities might be the result of running `prune_objects` without `--prune-orphaned-activities`.
The same notes and warnings apply as for `prune_objects`.
The task will print out how many rows were freed in total in its last
line of output in the form `Deleted 345 rows`.
When running the job in limited batches this can be used to determine
when all orphaned activities have been deleted.
=== "OTP"
```sh
./bin/pleroma_ctl database prune_orphaned_activities [option ...]
```
=== "From Source"
```sh
mix pleroma.database prune_orphaned_activities [option ...]
```
### Options
- `--limit n` - Only delete up to `n` activities in each query making up this job, i.e. if this job runs two queries at most `2n` activities will be deleted. Running this task repeatedly in limited batches can help maintain the instances responsiveness while still freeing up some space.
## Create a conversation for all existing DMs
Can be safely re-run

View File

@ -20,6 +20,63 @@ defmodule Mix.Tasks.Pleroma.Database do
@shortdoc "A collection of database related tasks"
@moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md")
def maybe_limit(query, limit_cnt) do
if is_number(limit_cnt) and limit_cnt > 0 do
limit(query, [], ^limit_cnt)
else
query
end
end
def prune_orphaned_activities(limit \\ 0) when is_number(limit) do
limit_arg =
if limit > 0 do
"LIMIT #{limit}"
else
""
end
# Prune activities who link to a single object
{:ok, %{:num_rows => del_single}} =
"""
delete from public.activities
where id in (
select a.id from public.activities a
left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
left join public.users u on a.data ->> 'object' = u.ap_id
where not a.local
and jsonb_typeof(a."data" -> 'object') = 'string'
and o.id is null
and a2.id is null
and u.id is null
#{limit_arg}
)
"""
|> Repo.query([], timeout: :infinity)
# Prune activities who link to an array of objects
{:ok, %{:num_rows => del_array}} =
"""
delete from public.activities
where id in (
select a.id from public.activities a
join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
left join public.objects o on j.value = o.data ->> 'id'
left join public.activities a2 on j.value = a2.data ->> 'id'
left join public.users u on j.value = u.ap_id
group by a.id
having max(o.data ->> 'id') is null
and max(a2.data ->> 'id') is null
and max(u.ap_id) is null
#{limit_arg}
)
"""
|> Repo.query([], timeout: :infinity)
del_single + del_array
end
def run(["remove_embedded_objects" | args]) do
{options, [], []} =
OptionParser.parse(
@ -62,6 +119,35 @@ defmodule Mix.Tasks.Pleroma.Database do
)
end
def run(["prune_orphaned_activities" | args]) do
{options, [], []} =
OptionParser.parse(
args,
strict: [
limit: :integer
]
)
start_pleroma()
limit = Keyword.get(options, :limit, 0)
log_message = "Pruning orphaned activities"
log_message =
if limit > 0 do
log_message <> ", limiting deletion to #{limit} rows"
else
log_message
end
Logger.info(log_message)
deleted = prune_orphaned_activities(limit)
Logger.info("Deleted #{deleted} rows")
end
def run(["prune_objects" | args]) do
{options, [], []} =
OptionParser.parse(
@ -70,7 +156,8 @@ defmodule Mix.Tasks.Pleroma.Database do
vacuum: :boolean,
keep_threads: :boolean,
keep_non_public: :boolean,
prune_orphaned_activities: :boolean
prune_orphaned_activities: :boolean,
limit: :integer
]
)
@ -79,6 +166,8 @@ defmodule Mix.Tasks.Pleroma.Database do
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days])
time_deadline = NaiveDateTime.utc_now() |> NaiveDateTime.add(-(deadline * 86_400))
limit_cnt = Keyword.get(options, :limit, 0)
log_message = "Pruning objects older than #{deadline} days"
log_message =
@ -110,6 +199,13 @@ defmodule Mix.Tasks.Pleroma.Database do
log_message
end
log_message =
if limit_cnt > 0 do
log_message <> ", limiting to #{limit_cnt} rows"
else
log_message
end
Logger.info(log_message)
if Keyword.get(options, :keep_threads) do
@ -143,31 +239,38 @@ defmodule Mix.Tasks.Pleroma.Database do
|> having([a], max(a.updated_at) < ^time_deadline)
|> having([a], not fragment("bool_or(?)", a.local))
|> having([_, b], fragment("max(?::text) is null", b.id))
|> maybe_limit(limit_cnt)
|> select([a], fragment("? ->> 'context'::text", a.data))
Pleroma.Object
|> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context))
else
if Keyword.get(options, :keep_non_public) do
Pleroma.Object
deletable =
if Keyword.get(options, :keep_non_public) do
Pleroma.Object
|> where(
[o],
fragment(
"?->'to' \\? ? OR ?->'cc' \\? ?",
o.data,
^Pleroma.Constants.as_public(),
o.data,
^Pleroma.Constants.as_public()
)
)
else
Pleroma.Object
end
|> where([o], o.updated_at < ^time_deadline)
|> where(
[o],
fragment(
"?->'to' \\? ? OR ?->'cc' \\? ?",
o.data,
^Pleroma.Constants.as_public(),
o.data,
^Pleroma.Constants.as_public()
)
fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
)
else
Pleroma.Object
end
|> where([o], o.updated_at < ^time_deadline)
|> where(
[o],
fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
)
|> maybe_limit(limit_cnt)
|> select([o], o.id)
Pleroma.Object
|> where([o], o.id in subquery(deletable))
end
|> Repo.delete_all(timeout: :infinity)
@ -187,39 +290,7 @@ defmodule Mix.Tasks.Pleroma.Database do
end
if Keyword.get(options, :prune_orphaned_activities) do
# Prune activities who link to a single object
"""
delete from public.activities
where id in (
select a.id from public.activities a
left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
left join public.users u on a.data ->> 'object' = u.ap_id
where not a.local
and jsonb_typeof(a."data" -> 'object') = 'string'
and o.id is null
and a2.id is null
and u.id is null
)
"""
|> Repo.query([], timeout: :infinity)
# Prune activities who link to an array of objects
"""
delete from public.activities
where id in (
select a.id from public.activities a
join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
left join public.objects o on j.value = o.data ->> 'id'
left join public.activities a2 on j.value = a2.data ->> 'id'
left join public.users u on j.value = u.ap_id
group by a.id
having max(o.data ->> 'id') is null
and max(a2.data ->> 'id') is null
and max(u.ap_id) is null
)
"""
|> Repo.query([], timeout: :infinity)
prune_orphaned_activities()
end
"""

View File

@ -470,7 +470,7 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
assert length(activities) == 4
end
test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do
test "it prunes orphaned activities with prune_orphaned_activities when the objects are referenced from an array" do
%Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert()
%User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert()
@ -517,7 +517,7 @@ defmodule Mix.Tasks.Pleroma.DatabaseTest do
assert length(Repo.all(Activity)) == 4
Mix.Tasks.Pleroma.Database.run(["prune_objects"])
assert length(Repo.all(Activity)) == 4
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])
Mix.Tasks.Pleroma.Database.run(["prune_orphaned_activities"])
activities = Repo.all(Activity)
assert length(activities) == 3