diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e638bdd8..c3e88f071 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Admin scopes will be dropped on create - Rich media will now backoff for 20 minutes after a failure - Quote posts are now considered as part of the same thread as the post they are quoting +- Extend the mix task `prune_objects` with options to keep more relevant posts - Simplified HTTP signature processing - Rich media will now hard-exit after 5 seconds, to prevent timeline hangs - HTTP Content Security Policy is now far more strict to prevent any potential XSS/CSS leakages diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index 73419dc81..915139cf7 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -27,7 +27,7 @@ Replaces embedded objects with references to them in the `objects` table. Only n ## Prune old remote posts from the database -This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database, they will be refetched from source when accessed. +This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases. !!! danger The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free. @@ -45,6 +45,9 @@ This will prune remote posts older than 90 days (configurable with [`config :ple ``` ### Options + +- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...) +- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote - `--vacuum` - run `VACUUM FULL` after the objects are pruned ## Create a conversation for all existing DMs @@ -178,4 +181,4 @@ to the current day. ```sh mix pleroma.database prune_task - ``` \ No newline at end of file + ``` diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 272c9e3e5..be59e2271 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -67,33 +67,92 @@ def run(["prune_objects" | args]) do OptionParser.parse( args, strict: [ - vacuum: :boolean + vacuum: :boolean, + keep_threads: :boolean, + keep_non_public: :boolean ] ) start_pleroma() deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + time_deadline = NaiveDateTime.utc_now() |> NaiveDateTime.add(-(deadline * 86_400)) - Logger.info("Pruning objects older than #{deadline} days") + log_message = "Pruning objects older than #{deadline} days" - time_deadline = - NaiveDateTime.utc_now() - |> NaiveDateTime.add(-(deadline * 86_400)) + log_message = + if Keyword.get(options, :keep_non_public) do + log_message <> ", keeping non public posts" + else + log_message + end - from(o in Object, - where: - fragment( - "?->'to' \\? ? OR ?->'cc' \\? ?", - o.data, - ^Pleroma.Constants.as_public(), - o.data, - ^Pleroma.Constants.as_public() - ), - where: o.inserted_at < ^time_deadline, - where: + log_message = + if Keyword.get(options, :keep_threads) do + log_message <> ", keeping threads intact" + else + log_message + end + + Logger.info(log_message) + + if Keyword.get(options, :keep_threads) do + # We want to delete objects from threads where + # 1. the newest post is still old + # 2. none of the activities is local + # 3. none of the activities is bookmarked + # 4. optionally none of the posts is non-public + deletable_context = + if Keyword.get(options, :keep_non_public) do + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + |> having( + [a], + not fragment( + # Posts (checked on Create Activity) is non-public + "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')", + a.data, + ^Pleroma.Constants.as_public(), + a.data, + ^Pleroma.Constants.as_public(), + a.data + ) + ) + else + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + end + |> having([a], max(a.updated_at) < ^time_deadline) + |> having([a], not fragment("bool_or(?)", a.local)) + |> having([_, b], fragment("max(?::text) is null", b.id)) + |> select([a], fragment("? ->> 'context'::text", a.data)) + + Pleroma.Object + |> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)) + else + if Keyword.get(options, :keep_non_public) do + Pleroma.Object + |> where( + [o], + fragment( + "?->'to' \\? ? OR ?->'cc' \\? ?", + o.data, + ^Pleroma.Constants.as_public(), + o.data, + ^Pleroma.Constants.as_public() + ) + ) + else + Pleroma.Object + end + |> where([o], o.updated_at < ^time_deadline) + |> where( + [o], fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) - ) + ) + end |> Repo.delete_all(timeout: :infinity) prune_hashtags_query = """ diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 7a1a759da..447a4404e 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -46,7 +46,6 @@ test "it replaces objects with references" do describe "prune_objects" do test "it prunes old objects from the database" do - insert(:note) deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 date = @@ -55,18 +54,304 @@ test "it prunes old objects from the database" do |> Timex.to_naive_datetime() |> NaiveDateTime.truncate(:second) - %{id: id} = + insert(:note) + + %{id: note_remote_public_id} = :note |> insert() - |> Ecto.Changeset.change(%{inserted_at: date}) + |> Ecto.Changeset.change(%{updated_at: date}) |> Repo.update!() - assert length(Repo.all(Object)) == 2 + note_remote_non_public = + %{id: note_remote_non_public_id, data: note_remote_non_public_data} = + :note + |> insert() + + note_remote_non_public + |> Ecto.Changeset.change(%{ + updated_at: date, + data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + assert length(Repo.all(Object)) == 3 Mix.Tasks.Pleroma.Database.run(["prune_objects"]) assert length(Repo.all(Object)) == 1 - refute Object.get_by_id(id) + refute Object.get_by_id(note_remote_public_id) + refute Object.get_by_id(note_remote_non_public_id) + end + + test "with the --keep-non-public option it still keeps non-public posts even if they are not local" do + deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 + + date = + Timex.now() + |> Timex.shift(days: -deadline) + |> Timex.to_naive_datetime() + |> NaiveDateTime.truncate(:second) + + insert(:note) + + %{id: note_remote_id} = + :note + |> insert() + |> Ecto.Changeset.change(%{updated_at: date}) + |> Repo.update!() + + note_remote_non_public = + %{data: note_remote_non_public_data} = + :note + |> insert() + + note_remote_non_public + |> Ecto.Changeset.change(%{ + updated_at: date, + data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + assert length(Repo.all(Object)) == 3 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-non-public"]) + + assert length(Repo.all(Object)) == 2 + refute Object.get_by_id(note_remote_id) + end + + test "with the --keep-threads and --keep-non-public option it keeps old threads with non-public replies even if the interaction is not local" do + # For non-public we only check Create Activities because only these are relevant for threads + # Flags are always non-public, Announces from relays can be non-public... + deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 + + old_insert_date = + Timex.now() + |> Timex.shift(days: -deadline) + |> Timex.to_naive_datetime() + |> NaiveDateTime.truncate(:second) + + remote_user1 = insert(:user, local: false) + remote_user2 = insert(:user, local: false) + + # Old remote non-public reply (should be kept) + {:ok, old_remote_post1_activity} = + CommonAPI.post(remote_user1, %{status: "some thing", local: false}) + + old_remote_post1_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_remote_non_public_reply_activity} = + CommonAPI.post(remote_user2, %{ + status: "some reply", + in_reply_to_status_id: old_remote_post1_activity.id + }) + + old_remote_non_public_reply_activity + |> Ecto.Changeset.change(%{ + local: false, + updated_at: old_insert_date, + data: old_remote_non_public_reply_activity.data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + # Old remote non-public Announce (should be removed) + {:ok, old_remote_post2_activity = %{data: %{"object" => old_remote_post2_id}}} = + CommonAPI.post(remote_user1, %{status: "some thing", local: false}) + + old_remote_post2_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_remote_non_public_repeat_activity} = + CommonAPI.repeat(old_remote_post2_activity.id, remote_user2) + + old_remote_non_public_repeat_activity + |> Ecto.Changeset.change(%{ + local: false, + updated_at: old_insert_date, + data: old_remote_non_public_repeat_activity.data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + assert length(Repo.all(Object)) == 3 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads", "--keep-non-public"]) + + Repo.all(Pleroma.Activity) + assert length(Repo.all(Object)) == 2 + refute Object.get_by_ap_id(old_remote_post2_id) + end + + test "with the --keep-threads option it still keeps non-old threads even with no local interactions" do + remote_user = insert(:user, local: false) + remote_user2 = insert(:user, local: false) + + {:ok, remote_post_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + {:ok, remote_post_reply_activity} = + CommonAPI.post(remote_user2, %{ + status: "some reply", + in_reply_to_status_id: remote_post_activity.id + }) + + remote_post_activity + |> Ecto.Changeset.change(%{local: false}) + |> Repo.update!() + + remote_post_reply_activity + |> Ecto.Changeset.change(%{local: false}) + |> Repo.update!() + + assert length(Repo.all(Object)) == 2 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"]) + + assert length(Repo.all(Object)) == 2 + end + + test "with the --keep-threads option it deletes old threads with no local interaction" do + deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 + + old_insert_date = + Timex.now() + |> Timex.shift(days: -deadline) + |> Timex.to_naive_datetime() + |> NaiveDateTime.truncate(:second) + + remote_user = insert(:user, local: false) + remote_user2 = insert(:user, local: false) + + {:ok, old_remote_post_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_remote_post_reply_activity} = + CommonAPI.post(remote_user2, %{ + status: "some reply", + in_reply_to_status_id: old_remote_post_activity.id + }) + + old_remote_post_reply_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_favourite_activity} = + CommonAPI.favorite(remote_user2, old_remote_post_activity.id) + + old_favourite_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_repeat_activity} = CommonAPI.repeat(old_remote_post_activity.id, remote_user2) + + old_repeat_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + assert length(Repo.all(Object)) == 2 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"]) + + assert length(Repo.all(Object)) == 0 + end + + test "with the --keep-threads option it keeps old threads with local interaction" do + deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 + + old_insert_date = + Timex.now() + |> Timex.shift(days: -deadline) + |> Timex.to_naive_datetime() + |> NaiveDateTime.truncate(:second) + + remote_user = insert(:user, local: false) + local_user = insert(:user, local: true) + + # local reply + {:ok, old_remote_post1_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post1_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_local_post2_reply_activity} = + CommonAPI.post(local_user, %{ + status: "some reply", + in_reply_to_status_id: old_remote_post1_activity.id + }) + + old_local_post2_reply_activity + |> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date}) + |> Repo.update!() + + # local Like + {:ok, old_remote_post3_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post3_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_favourite_activity} = CommonAPI.favorite(local_user, old_remote_post3_activity.id) + + old_favourite_activity + |> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date}) + |> Repo.update!() + + # local Announce + {:ok, old_remote_post4_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post4_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_repeat_activity} = CommonAPI.repeat(old_remote_post4_activity.id, local_user) + + old_repeat_activity + |> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date}) + |> Repo.update!() + + assert length(Repo.all(Object)) == 4 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"]) + + assert length(Repo.all(Object)) == 4 + end + + test "with the --keep-threads option it keeps old threads with bookmarked posts" do + deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 + + old_insert_date = + Timex.now() + |> Timex.shift(days: -deadline) + |> Timex.to_naive_datetime() + |> NaiveDateTime.truncate(:second) + + remote_user = insert(:user, local: false) + local_user = insert(:user, local: true) + + {:ok, old_remote_post_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + Pleroma.Bookmark.create(local_user.id, old_remote_post_activity.id) + + assert length(Repo.all(Object)) == 1 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"]) + + assert length(Repo.all(Object)) == 1 end end