From f5b5838c4db7937837104442dbf580500cda33e9 Mon Sep 17 00:00:00 2001 From: Oneric Date: Mon, 23 Oct 2023 00:52:34 +0200 Subject: [PATCH 01/10] refactor: move prune_orphaned_activities into own function No logic changes. Preparation for standalone orphan pruning. --- lib/mix/tasks/pleroma/database.ex | 70 ++++++++++++++++--------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 09d2a4072..20d035dfd 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -20,6 +20,42 @@ defmodule Mix.Tasks.Pleroma.Database do @shortdoc "A collection of database related tasks" @moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md") + def prune_orphaned_activities() do + # Prune activities who link to a single object + """ + delete from public.activities + where id in ( + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + where not a.local + and jsonb_typeof(a."data" -> 'object') = 'string' + and o.id is null + and a2.id is null + and u.id is null + ) + """ + |> Repo.query([], timeout: :infinity) + + # Prune activities who link to an array of objects + """ + delete from public.activities + where id in ( + select a.id from public.activities a + join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + left join public.objects o on j.value = o.data ->> 'id' + left join public.activities a2 on j.value = a2.data ->> 'id' + left join public.users u on j.value = u.ap_id + group by a.id + having max(o.data ->> 'id') is null + and max(a2.data ->> 'id') is null + and max(u.ap_id) is null + ) + """ + |> Repo.query([], timeout: :infinity) + end + def run(["remove_embedded_objects" | args]) do {options, 
[], []} = OptionParser.parse( @@ -187,39 +223,7 @@ def run(["prune_objects" | args]) do end if Keyword.get(options, :prune_orphaned_activities) do - # Prune activities who link to a single object - """ - delete from public.activities - where id in ( - select a.id from public.activities a - left join public.objects o on a.data ->> 'object' = o.data ->> 'id' - left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' - left join public.users u on a.data ->> 'object' = u.ap_id - where not a.local - and jsonb_typeof(a."data" -> 'object') = 'string' - and o.id is null - and a2.id is null - and u.id is null - ) - """ - |> Repo.query([], timeout: :infinity) - - # Prune activities who link to an array of objects - """ - delete from public.activities - where id in ( - select a.id from public.activities a - join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' - left join public.objects o on j.value = o.data ->> 'id' - left join public.activities a2 on j.value = a2.data ->> 'id' - left join public.users u on j.value = u.ap_id - group by a.id - having max(o.data ->> 'id') is null - and max(a2.data ->> 'id') is null - and max(u.ap_id) is null - ) - """ - |> Repo.query([], timeout: :infinity) + prune_orphaned_activities() end """ From ff684ba8eab753dc56af5d79b7e2ffc6c1b6d09b Mon Sep 17 00:00:00 2001 From: Oneric Date: Mon, 23 Oct 2023 01:01:07 +0200 Subject: [PATCH 02/10] Add standalone prune_orphaned_activities CLI task This part of pruning can be very expensive and bog down the whole instance to an unusable state for a long time. It can thus be desirable to split it from prune_objects and run it on its own in smaller limited batches. If the batches are small enough and spaced out a bit, it may even be possible to avoid any downtime. If not, the limit can still help to at least make the downtime duration somewhat more predictable. 
--- CHANGELOG.md | 1 + .../docs/administration/CLI_tasks/database.md | 22 +++++++++++ lib/mix/tasks/pleroma/database.ex | 38 ++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2284f5c8d..5fca38403 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,6 +92,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Akkoma API is now documented - ability to auto-approve follow requests from users you are already following - The SimplePolicy MRF can now strip user backgrounds from selected remote hosts +- New standalone `prune_orphaned_activities` mix task with configurable batch limit ## Changed - OTP builds are now built on erlang OTP26 diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index 3d7424d1c..eba56da10 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -53,6 +53,28 @@ This will prune remote posts older than 90 days (configurable with [`config :ple - `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size. - `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. +## Prune orphaned activities from the database + +This will prune activities which are no longer referenced by anything. +Such activities might be the result of running `prune_objects` without `--prune-orphaned-activities`. +The same notes and warnings apply as for `prune_objects`. + +=== "OTP" + + ```sh + ./bin/pleroma_ctl database prune_orphaned_activities [option ...] + ``` + +=== "From Source" + + ```sh + mix pleroma.database prune_orphaned_activities [option ...] 
+ ``` + +### Options + +- `--limit n` - Only delete up to `n` activities in each query making up this job, i.e. if this job runs two queries at most `2n` activities will be deleted. Running this task repeatedly in limited batches can help maintain the instance’s responsiveness while still freeing up some space. + ## Create a conversation for all existing DMs Can be safely re-run diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 20d035dfd..8ded7bbec 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -20,7 +20,14 @@ defmodule Mix.Tasks.Pleroma.Database do @shortdoc "A collection of database related tasks" @moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md") - def prune_orphaned_activities() do + def prune_orphaned_activities(limit \\ 0) when is_number(limit) do + limit_arg = + if limit > 0 do + "LIMIT #{limit}" + else + "" + end + # Prune activities who link to a single object """ delete from public.activities @@ -34,6 +41,7 @@ def prune_orphaned_activities() do and o.id is null and a2.id is null and u.id is null + #{limit_arg} ) """ |> Repo.query([], timeout: :infinity) @@ -51,6 +59,7 @@ def prune_orphaned_activities() do having max(o.data ->> 'id') is null and max(a2.data ->> 'id') is null and max(u.ap_id) is null + #{limit_arg} ) """ |> Repo.query([], timeout: :infinity) @@ -98,6 +107,33 @@ def run(["update_users_following_followers_counts"]) do ) end + def run(["prune_orphaned_activities" | args]) do + {options, [], []} = + OptionParser.parse( + args, + strict: [ + limit: :integer + ] + ) + + start_pleroma() + + limit = Keyword.get(options, :limit, 0) + + log_message = "Pruning orphaned activities" + + log_message = + if limit > 0 do + log_message <> ", limiting deletion to #{limit} rows" + else + log_message + end + + Logger.info(log_message) + + prune_orphaned_activities(limit) + end + def run(["prune_objects" | args]) do {options, [], []} = OptionParser.parse( From 
3258842d0cd9f3e7606f75b27de366983dc2fd8d Mon Sep 17 00:00:00 2001 From: Oneric Date: Mon, 23 Oct 2023 01:27:56 +0200 Subject: [PATCH 03/10] Log number of deleted rows in prune_orphaned_activities This gives feedback when to stop rerunning limited batches. Most of the diff is just adjusting indentation; best reviewed with whitespace-only changes hidden, e.g. `git diff -w`. --- .../docs/administration/CLI_tasks/database.md | 5 ++ lib/mix/tasks/pleroma/database.ex | 72 ++++++++++--------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index eba56da10..c57817bf4 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -59,6 +59,11 @@ This will prune activities which are no longer referenced by anything. Such activities might be the result of running `prune_objects` without `--prune-orphaned-activities`. The same notes and warnings apply as for `prune_objects`. +The task will print out how many rows were freed in total in its last +line of output in the form `Deleted 345 rows`. +When running the job in limited batches this can be used to determine +when all orphaned activities have been deleted. 
+ === "OTP" ```sh diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 8ded7bbec..083f73fe2 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -29,40 +29,44 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do end # Prune activities who link to a single object - """ - delete from public.activities - where id in ( - select a.id from public.activities a - left join public.objects o on a.data ->> 'object' = o.data ->> 'id' - left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' - left join public.users u on a.data ->> 'object' = u.ap_id - where not a.local - and jsonb_typeof(a."data" -> 'object') = 'string' - and o.id is null - and a2.id is null - and u.id is null - #{limit_arg} - ) - """ - |> Repo.query([], timeout: :infinity) + {:ok, %{:num_rows => del_single}} = + """ + delete from public.activities + where id in ( + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + where not a.local + and jsonb_typeof(a."data" -> 'object') = 'string' + and o.id is null + and a2.id is null + and u.id is null + #{limit_arg} + ) + """ + |> Repo.query([], timeout: :infinity) # Prune activities who link to an array of objects - """ - delete from public.activities - where id in ( - select a.id from public.activities a - join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' - left join public.objects o on j.value = o.data ->> 'id' - left join public.activities a2 on j.value = a2.data ->> 'id' - left join public.users u on j.value = u.ap_id - group by a.id - having max(o.data ->> 'id') is null - and max(a2.data ->> 'id') is null - and max(u.ap_id) is null - #{limit_arg} - ) - """ - |> Repo.query([], timeout: :infinity) + {:ok, 
%{:num_rows => del_array}} = + """ + delete from public.activities + where id in ( + select a.id from public.activities a + join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + left join public.objects o on j.value = o.data ->> 'id' + left join public.activities a2 on j.value = a2.data ->> 'id' + left join public.users u on j.value = u.ap_id + group by a.id + having max(o.data ->> 'id') is null + and max(a2.data ->> 'id') is null + and max(u.ap_id) is null + #{limit_arg} + ) + """ + |> Repo.query([], timeout: :infinity) + + del_single + del_array end def run(["remove_embedded_objects" | args]) do @@ -131,7 +135,9 @@ def run(["prune_orphaned_activities" | args]) do Logger.info(log_message) - prune_orphaned_activities(limit) + deleted = prune_orphaned_activities(limit) + + Logger.info("Deleted #{deleted} rows") end def run(["prune_objects" | args]) do From b03947917a880b84759f33a48d585b84773bcfec Mon Sep 17 00:00:00 2001 From: Oneric Date: Mon, 23 Oct 2023 17:29:02 +0200 Subject: [PATCH 04/10] Also allow limiting the initial prune_object May sometimes be helpful to get more predictable runtime than just with an age-based limit. The subquery for the non-keep-threads path is required since delete_all does not directly accept limit(). Again most of the diff is just adjusting indentation, best hide whitespace-only changes with git diff -w or similar. --- CHANGELOG.md | 1 + .../docs/administration/CLI_tasks/database.md | 1 + lib/mix/tasks/pleroma/database.ex | 61 +++++++++++++------ 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fca38403..19f1b2a3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- ability to auto-approve follow requests from users you are already following - The SimplePolicy MRF can now strip user backgrounds from selected remote hosts - New standalone `prune_orphaned_activities` mix task with configurable batch limit +- The `prune_objects` mix task now accepts a `--limit` parameter for initial object pruning ## Changed - OTP builds are now built on erlang OTP26 diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index c57817bf4..bbf29fc60 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -50,6 +50,7 @@ This will prune remote posts older than 90 days (configurable with [`config :ple - `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also wont delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity). - `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote. +- `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task. - `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size. - `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. 
diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 083f73fe2..b8f19551a 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -20,6 +20,14 @@ defmodule Mix.Tasks.Pleroma.Database do @shortdoc "A collection of database related tasks" @moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md") + defp maybe_limit(query, limit_cnt) do + if is_number(limit_cnt) and limit_cnt > 0 do + limit(query, [], ^limit_cnt) + else + query + end + end + def prune_orphaned_activities(limit \\ 0) when is_number(limit) do limit_arg = if limit > 0 do @@ -148,7 +156,8 @@ def run(["prune_objects" | args]) do vacuum: :boolean, keep_threads: :boolean, keep_non_public: :boolean, - prune_orphaned_activities: :boolean + prune_orphaned_activities: :boolean, + limit: :integer ] ) @@ -157,6 +166,8 @@ def run(["prune_objects" | args]) do deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) time_deadline = NaiveDateTime.utc_now() |> NaiveDateTime.add(-(deadline * 86_400)) + limit_cnt = Keyword.get(options, :limit, 0) + log_message = "Pruning objects older than #{deadline} days" log_message = @@ -188,6 +199,13 @@ def run(["prune_objects" | args]) do log_message end + log_message = + if limit_cnt > 0 do + log_message <> ", limiting to #{limit_cnt} rows" + else + log_message + end + Logger.info(log_message) if Keyword.get(options, :keep_threads) do @@ -221,31 +239,38 @@ def run(["prune_objects" | args]) do |> having([a], max(a.updated_at) < ^time_deadline) |> having([a], not fragment("bool_or(?)", a.local)) |> having([_, b], fragment("max(?::text) is null", b.id)) + |> maybe_limit(limit_cnt) |> select([a], fragment("? ->> 'context'::text", a.data)) Pleroma.Object |> where([o], fragment("? 
->> 'context'::text", o.data) in subquery(deletable_context)) else - if Keyword.get(options, :keep_non_public) do - Pleroma.Object + deletable = + if Keyword.get(options, :keep_non_public) do + Pleroma.Object + |> where( + [o], + fragment( + "?->'to' \\? ? OR ?->'cc' \\? ?", + o.data, + ^Pleroma.Constants.as_public(), + o.data, + ^Pleroma.Constants.as_public() + ) + ) + else + Pleroma.Object + end + |> where([o], o.updated_at < ^time_deadline) |> where( [o], - fragment( - "?->'to' \\? ? OR ?->'cc' \\? ?", - o.data, - ^Pleroma.Constants.as_public(), - o.data, - ^Pleroma.Constants.as_public() - ) + fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) ) - else - Pleroma.Object - end - |> where([o], o.updated_at < ^time_deadline) - |> where( - [o], - fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) - ) + |> maybe_limit(limit_cnt) + |> select([o], o.id) + + Pleroma.Object + |> where([o], o.id in subquery(deletable)) end |> Repo.delete_all(timeout: :infinity) From 1caac640da10896bd8e44b7157fad47e8ae42a32 Mon Sep 17 00:00:00 2001 From: Oneric Date: Sat, 10 Feb 2024 03:03:13 +0100 Subject: [PATCH 05/10] Test both standalone and flag mode for pruning orphaned activities --- test/mix/tasks/pleroma/database_test.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 97fa830ff..cbb40f3e8 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -470,7 +470,7 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do assert length(activities) == 4 end - test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do + test "it prunes orphaned activities with prune_orphaned_activities when the objects are referenced from an array" do %Object{} |> Map.merge(%{data: %{"id" => 
"existing_object"}}) |> Repo.insert() %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert() @@ -517,7 +517,7 @@ test "it prunes orphaned activities with the --prune-orphaned-activities when th assert length(Repo.all(Activity)) == 4 Mix.Tasks.Pleroma.Database.run(["prune_objects"]) assert length(Repo.all(Activity)) == 4 - Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + Mix.Tasks.Pleroma.Database.run(["prune_orphaned_activities"]) activities = Repo.all(Activity) assert length(activities) == 3 From 7e038868864d529cfdd736a08bd54bffc3121d0a Mon Sep 17 00:00:00 2001 From: Oneric Date: Wed, 15 May 2024 01:20:27 +0200 Subject: [PATCH 06/10] dbprune: shortcut array activity search This brought down query costs from 7,953,740.90 to 47,600.97 --- lib/mix/tasks/pleroma/database.ex | 16 +++++++++++++++- test/mix/tasks/pleroma/database_test.exs | 4 ++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index b8f19551a..72364dc6f 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -36,6 +36,19 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do "" end + # Activities can either refer to a single object id, and array of object ids + # or contain an inlined object (at least after going through our normalisation) + # + # Flag is the only type we support with an array (and always has arrays). + # Update the only one with inlined objects, but old Update activities are + # + # We already regularly purge old Delte, Undo, Update and Remove and if + # rejected Follow requests anyway; no need to explicitly deal with those here. + # + # Since there’s an index on types and there are typically only few Flag + # activites, it’s _much_ faster to utilise the index. To avoid accidentally + # deleting useful activities should more types be added, keep typeof for singles. 
+ # Prune activities who link to a single object {:ok, %{:num_rows => del_single}} = """ @@ -61,7 +74,8 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do delete from public.activities where id in ( select a.id from public.activities a - join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + join json_array_elements_text((a."data" -> 'object')::json) as j + on a.data->>'type' = 'Flag' left join public.objects o on j.value = o.data ->> 'id' left join public.activities a2 on j.value = a2.data ->> 'id' left join public.users u on j.value = u.ap_id diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index cbb40f3e8..c9163e42f 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -478,6 +478,7 @@ test "it prunes orphaned activities with prune_orphaned_activities when the obje |> Map.merge(%{ local: false, data: %{ + "type" => "Flag", "id" => "remote_activity_existing_object", "object" => ["non_ existing_object", "existing_object"] } @@ -488,6 +489,7 @@ test "it prunes orphaned activities with prune_orphaned_activities when the obje |> Map.merge(%{ local: false, data: %{ + "type" => "Flag", "id" => "remote_activity_existing_actor", "object" => ["non_ existing_object", "existing_actor"] } @@ -498,6 +500,7 @@ test "it prunes orphaned activities with prune_orphaned_activities when the obje |> Map.merge(%{ local: false, data: %{ + "type" => "Flag", "id" => "remote_activity_existing_activity", "object" => ["non_ existing_object", "remote_activity_existing_actor"] } @@ -508,6 +511,7 @@ test "it prunes orphaned activities with prune_orphaned_activities when the obje |> Map.merge(%{ local: false, data: %{ + "type" => "Flag", "id" => "remote_activity_without_existing_referenced_object", "object" => ["owo", "whats_this"] } From 91e4f4f885bca86e5fe9b27ea620f4937d4133d0 Mon Sep 17 00:00:00 2001 From: Oneric 
Date: Wed, 15 May 2024 01:33:41 +0200 Subject: [PATCH 07/10] dbprune: add more logs Pruning can go on for a long time; give admins some insight that something is happening, to make it less frustrating and to make it easier to tell which part of the process has stalled should this happen. Again most of the changes are merely reindents; review with whitespace changes hidden recommended. --- lib/mix/tasks/pleroma/database.ex | 171 ++++++++++++++++-------------- 1 file changed, 94 insertions(+), 77 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 72364dc6f..561a70291 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -68,6 +68,8 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do """ |> Repo.query([], timeout: :infinity) + Logger.info("Prune activity singles: deteleted #{del_single} rows...") + # Prune activities who link to an array of objects {:ok, %{:num_rows => del_array}} = """ @@ -88,6 +90,8 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do """ |> Repo.query([], timeout: :infinity) + Logger.info("Prune activity arrays: deteleted #{del_array} rows...") + del_single + del_array end @@ -222,102 +226,115 @@ def run(["prune_objects" | args]) do Logger.info(log_message) - if Keyword.get(options, :keep_threads) do - # We want to delete objects from threads where - # 1. the newest post is still old - # 2. none of the activities is local - # 3. none of the activities is bookmarked - # 4. optionally none of the posts is non-public - deletable_context = - if Keyword.get(options, :keep_non_public) do - Pleroma.Activity - |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) - |> group_by([a], fragment("? ->> 'context'::text", a.data)) - |> having( - [a], - not fragment( - # Posts (checked on Create Activity) is non-public - "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? 
->> 'type' = 'Create')", - a.data, - ^Pleroma.Constants.as_public(), - a.data, - ^Pleroma.Constants.as_public(), - a.data + {del_obj, _} = + if Keyword.get(options, :keep_threads) do + # We want to delete objects from threads where + # 1. the newest post is still old + # 2. none of the activities is local + # 3. none of the activities is bookmarked + # 4. optionally none of the posts is non-public + deletable_context = + if Keyword.get(options, :keep_non_public) do + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + |> having( + [a], + not fragment( + # Posts (checked on Create Activity) is non-public + "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')", + a.data, + ^Pleroma.Constants.as_public(), + a.data, + ^Pleroma.Constants.as_public(), + a.data + ) ) - ) - else - Pleroma.Activity - |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) - |> group_by([a], fragment("? ->> 'context'::text", a.data)) - end - |> having([a], max(a.updated_at) < ^time_deadline) - |> having([a], not fragment("bool_or(?)", a.local)) - |> having([_, b], fragment("max(?::text) is null", b.id)) - |> maybe_limit(limit_cnt) - |> select([a], fragment("? ->> 'context'::text", a.data)) + else + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + end + |> having([a], max(a.updated_at) < ^time_deadline) + |> having([a], not fragment("bool_or(?)", a.local)) + |> having([_, b], fragment("max(?::text) is null", b.id)) + |> maybe_limit(limit_cnt) + |> select([a], fragment("? ->> 'context'::text", a.data)) - Pleroma.Object - |> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)) - else - deletable = - if Keyword.get(options, :keep_non_public) do - Pleroma.Object + Pleroma.Object + |> where([o], fragment("? 
->> 'context'::text", o.data) in subquery(deletable_context)) + else + deletable = + if Keyword.get(options, :keep_non_public) do + Pleroma.Object + |> where( + [o], + fragment( + "?->'to' \\? ? OR ?->'cc' \\? ?", + o.data, + ^Pleroma.Constants.as_public(), + o.data, + ^Pleroma.Constants.as_public() + ) + ) + else + Pleroma.Object + end + |> where([o], o.updated_at < ^time_deadline) |> where( [o], - fragment( - "?->'to' \\? ? OR ?->'cc' \\? ?", - o.data, - ^Pleroma.Constants.as_public(), - o.data, - ^Pleroma.Constants.as_public() - ) + fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) ) - else - Pleroma.Object - end - |> where([o], o.updated_at < ^time_deadline) - |> where( - [o], - fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) - ) - |> maybe_limit(limit_cnt) - |> select([o], o.id) + |> maybe_limit(limit_cnt) + |> select([o], o.id) - Pleroma.Object - |> where([o], o.id in subquery(deletable)) - end - |> Repo.delete_all(timeout: :infinity) + Pleroma.Object + |> where([o], o.id in subquery(deletable)) + end + |> Repo.delete_all(timeout: :infinity) + + Logger.info("Deleted #{del_obj} objects...") if !Keyword.get(options, :keep_threads) do # Without the --keep-threads option, it's possible that bookmarked # objects have been deleted. We remove the corresponding bookmarks. 
- """ - delete from public.bookmarks - where id in ( - select b.id from public.bookmarks b - left join public.activities a on b.activity_id = a.id - left join public.objects o on a."data" ->> 'object' = o.data ->> 'id' - where o.id is null - ) - """ - |> Repo.query([], timeout: :infinity) + {:ok, %{:num_rows => del_bookmarks}} = + """ + delete from public.bookmarks + where id in ( + select b.id from public.bookmarks b + left join public.activities a on b.activity_id = a.id + left join public.objects o on a."data" ->> 'object' = o.data ->> 'id' + where o.id is null + ) + """ + |> Repo.query([], timeout: :infinity) + + Logger.info("Deleted #{del_bookmarks} orphaned bookmarks...") end if Keyword.get(options, :prune_orphaned_activities) do - prune_orphaned_activities() + del_activities = prune_orphaned_activities() + Logger.info("Deleted #{del_activities} orphaned activities...") end - """ - DELETE FROM hashtags AS ht - WHERE NOT EXISTS ( - SELECT 1 FROM hashtags_objects hto - WHERE ht.id = hto.hashtag_id) - """ - |> Repo.query() + {:ok, %{:num_rows => del_hashtags}} = + """ + DELETE FROM hashtags AS ht + WHERE NOT EXISTS ( + SELECT 1 FROM hashtags_objects hto + WHERE ht.id = hto.hashtag_id) + """ + |> Repo.query() + + Logger.info("Deleted #{del_hashtags} no longer used hashtags...") if Keyword.get(options, :vacuum) do + Logger.info("Starting vacuum...") Maintenance.vacuum("full") end + + Logger.info("All done!") end def run(["prune_task"]) do From 3c319ea732795cbe5a5995c36ac4a09535a16397 Mon Sep 17 00:00:00 2001 From: Oneric Date: Wed, 15 May 2024 01:38:59 +0200 Subject: [PATCH 08/10] dbprune: use query! 
--- lib/mix/tasks/pleroma/database.ex | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 561a70291..76f6c5a03 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -50,7 +50,7 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do # deleting useful activities should more types be added, keep typeof for singles. # Prune activities who link to a single object - {:ok, %{:num_rows => del_single}} = + %{:num_rows => del_single} = """ delete from public.activities where id in ( @@ -66,12 +66,12 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do #{limit_arg} ) """ - |> Repo.query([], timeout: :infinity) + |> Repo.query!([], timeout: :infinity) Logger.info("Prune activity singles: deteleted #{del_single} rows...") # Prune activities who link to an array of objects - {:ok, %{:num_rows => del_array}} = + %{:num_rows => del_array} = """ delete from public.activities where id in ( @@ -88,7 +88,7 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do #{limit_arg} ) """ - |> Repo.query([], timeout: :infinity) + |> Repo.query!([], timeout: :infinity) Logger.info("Prune activity arrays: deteleted #{del_array} rows...") @@ -298,7 +298,7 @@ def run(["prune_objects" | args]) do if !Keyword.get(options, :keep_threads) do # Without the --keep-threads option, it's possible that bookmarked # objects have been deleted. We remove the corresponding bookmarks. 
- {:ok, %{:num_rows => del_bookmarks}} = + %{:num_rows => del_bookmarks} = """ delete from public.bookmarks where id in ( @@ -308,7 +308,7 @@ def run(["prune_objects" | args]) do where o.id is null ) """ - |> Repo.query([], timeout: :infinity) + |> Repo.query!([], timeout: :infinity) Logger.info("Deleted #{del_bookmarks} orphaned bookmarks...") end @@ -318,14 +318,14 @@ def run(["prune_objects" | args]) do Logger.info("Deleted #{del_activities} orphaned activities...") end - {:ok, %{:num_rows => del_hashtags}} = + %{:num_rows => del_hashtags} = """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( SELECT 1 FROM hashtags_objects hto WHERE ht.id = hto.hashtag_id) """ - |> Repo.query() + |> Repo.query!() Logger.info("Deleted #{del_hashtags} no longer used hashtags...") From 40ae91a45c3d6dba5f35afccd5be06a9ffd9ae31 Mon Sep 17 00:00:00 2001 From: Oneric Date: Wed, 15 May 2024 02:15:31 +0200 Subject: [PATCH 09/10] dbprune: allow splitting array and single activity prunes The former is typically just a few reports; it doesn't make sense to rerun it over and over again in batched prunes or if a full prune OOMed. --- .../docs/administration/CLI_tasks/database.md | 2 + lib/mix/tasks/pleroma/database.ex | 76 ++++++++++++------- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index bbf29fc60..580c9d32b 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -80,6 +80,8 @@ when all orphaned activities have been deleted. ### Options - `--limit n` - Only delete up to `n` activities in each query making up this job, i.e. if this job runs two queries at most `2n` activities will be deleted. Running this task repeatedly in limited batches can help maintain the instance’s responsiveness while still freeing up some space. 
+- `--no-singles` - Do not delete activities referencing single objects +- `--no-arrays` - Do not delete activities referencing an array of objects ## Create a conversation for all existing DMs diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 76f6c5a03..8d58acae2 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -28,28 +28,16 @@ defp maybe_limit(query, limit_cnt) do end end - def prune_orphaned_activities(limit \\ 0) when is_number(limit) do - limit_arg = - if limit > 0 do - "LIMIT #{limit}" - else - "" - end + defp limit_statement(limit) when is_number(limit) do + if limit > 0 do + "LIMIT #{limit}" + else + "" + end + end - # Activities can either refer to a single object id, and array of object ids - # or contain an inlined object (at least after going through our normalisation) - # - # Flag is the only type we support with an array (and always has arrays). - # Update the only one with inlined objects, but old Update activities are - # - # We already regularly purge old Delte, Undo, Update and Remove and if - # rejected Follow requests anyway; no need to explicitly deal with those here. - # - # Since there’s an index on types and there are typically only few Flag - # activites, it’s _much_ faster to utilise the index. To avoid accidentally - # deleting useful activities should more types be added, keep typeof for singles. 
- # Prune activities who link to a single object + defp prune_orphaned_activities_singles(limit) do %{:num_rows => del_single} = """ delete from public.activities @@ -63,14 +51,16 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do and o.id is null and a2.id is null and u.id is null - #{limit_arg} + #{limit_statement(limit)} ) """ |> Repo.query!([], timeout: :infinity) Logger.info("Prune activity singles: deteleted #{del_single} rows...") + del_single + end - # Prune activities who link to an array of objects + defp prune_orphaned_activities_array(limit) do %{:num_rows => del_array} = """ delete from public.activities @@ -85,12 +75,44 @@ def prune_orphaned_activities(limit \\ 0) when is_number(limit) do having max(o.data ->> 'id') is null and max(a2.data ->> 'id') is null and max(u.ap_id) is null - #{limit_arg} + #{limit_statement(limit)} ) """ |> Repo.query!([], timeout: :infinity) Logger.info("Prune activity arrays: deteleted #{del_array} rows...") + del_array + end + + def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do + # Activities can either refer to a single object id, and array of object ids + # or contain an inlined object (at least after going through our normalisation) + # + # Flag is the only type we support with an array (and always has arrays). + # Update the only one with inlined objects, but old Update activities are + # + # We already regularly purge old Delte, Undo, Update and Remove and if + # rejected Follow requests anyway; no need to explicitly deal with those here. + # + # Since there’s an index on types and there are typically only few Flag + # activites, it’s _much_ faster to utilise the index. To avoid accidentally + # deleting useful activities should more types be added, keep typeof for singles. 
+ + # Prune activities who link to a single object + del_single = + if Keyword.get(opts, :singles, true) do + prune_orphaned_activities_singles(limit) + else + 0 + end + + # Prune activities who link to an array of objects + del_array = + if Keyword.get(opts, :arrays, true) do + prune_orphaned_activities_array(limit) + else + 0 + end del_single + del_array end @@ -142,13 +164,15 @@ def run(["prune_orphaned_activities" | args]) do OptionParser.parse( args, strict: [ - limit: :integer + limit: :integer, + singles: :boolean, + arrays: :boolean, ] ) start_pleroma() - limit = Keyword.get(options, :limit, 0) + {limit, options} = Keyword.pop(options, :limit, 0) log_message = "Pruning orphaned activities" @@ -161,7 +185,7 @@ def run(["prune_orphaned_activities" | args]) do Logger.info(log_message) - deleted = prune_orphaned_activities(limit) + deleted = prune_orphaned_activities(limit, options) Logger.info("Deleted #{deleted} rows") end From c127d483083791ae4f7540650d7b51be1f680f16 Mon Sep 17 00:00:00 2001 From: Oneric Date: Wed, 15 May 2024 02:17:34 +0200 Subject: [PATCH 10/10] dbprune/activities: prune array activities first This query is less costly; if something goes wrong or gets aborted later at least this part will already be done. --- lib/mix/tasks/pleroma/database.ex | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 8d58acae2..bd545d617 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -98,14 +98,6 @@ def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do # activites, it’s _much_ faster to utilise the index. To avoid accidentally # deleting useful activities should more types be added, keep typeof for singles. 
- # Prune activities who link to a single object - del_single = - if Keyword.get(opts, :singles, true) do - prune_orphaned_activities_singles(limit) - else - 0 - end - # Prune activities who link to an array of objects del_array = if Keyword.get(opts, :arrays, true) do @@ -114,6 +106,14 @@ def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do 0 end + # Prune activities who link to a single object + del_single = + if Keyword.get(opts, :singles, true) do + prune_orphaned_activities_singles(limit) + else + 0 + end + del_single + del_array end