Prune Objects --keep-threads option #350

Merged
floatingghost merged 4 commits from ilja/akkoma:prune_objects_whithout_breaking_threads into develop 2023-01-09 22:15:42 +00:00
4 changed files with 372 additions and 24 deletions

View file

@ -27,6 +27,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Admin scopes will be dropped on create - Admin scopes will be dropped on create
- Rich media will now backoff for 20 minutes after a failure - Rich media will now backoff for 20 minutes after a failure
- Quote posts are now considered as part of the same thread as the post they are quoting - Quote posts are now considered as part of the same thread as the post they are quoting
- Extend the mix task `prune_objects` with options to keep more relevant posts
- Simplified HTTP signature processing - Simplified HTTP signature processing
- Rich media will now hard-exit after 5 seconds, to prevent timeline hangs - Rich media will now hard-exit after 5 seconds, to prevent timeline hangs
- HTTP Content Security Policy is now far more strict to prevent any potential XSS/CSS leakages - HTTP Content Security Policy is now far more strict to prevent any potential XSS/CSS leakages

View file

@ -27,7 +27,7 @@ Replaces embedded objects with references to them in the `objects` table. Only n
## Prune old remote posts from the database ## Prune old remote posts from the database
This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database, they will be refetched from source when accessed. This will prune remote posts older than 90 days (configurable with [`config :pleroma, :instance, remote_post_retention_days`](../../configuration/cheatsheet.md#instance)) from the database. Pruned posts may be refetched in some cases.
!!! danger !!! danger
The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free. The disk space will only be reclaimed after `VACUUM FULL`. You may run out of disk space during the execution of the task or vacuuming if you don't have about 1/3rds of the database size free.
@ -45,6 +45,9 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
``` ```
### Options ### Options
- `--keep-threads` - don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...)
- `--keep-non-public` - keep non-public posts like DM's and followers-only, even if they are remote
- `--vacuum` - run `VACUUM FULL` after the objects are pruned - `--vacuum` - run `VACUUM FULL` after the objects are pruned
## Create a conversation for all existing DMs ## Create a conversation for all existing DMs
@ -178,4 +181,4 @@ to the current day.
```sh ```sh
mix pleroma.database prune_task mix pleroma.database prune_task
``` ```

View file

@ -67,33 +67,92 @@ def run(["prune_objects" | args]) do
OptionParser.parse( OptionParser.parse(
args, args,
strict: [ strict: [
vacuum: :boolean vacuum: :boolean,
keep_threads: :boolean,
keep_non_public: :boolean
] ]
) )
start_pleroma() start_pleroma()
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) deadline = Pleroma.Config.get([:instance, :remote_post_retention_days])
time_deadline = NaiveDateTime.utc_now() |> NaiveDateTime.add(-(deadline * 86_400))
Logger.info("Pruning objects older than #{deadline} days") log_message = "Pruning objects older than #{deadline} days"
time_deadline = log_message =
NaiveDateTime.utc_now() if Keyword.get(options, :keep_non_public) do
|> NaiveDateTime.add(-(deadline * 86_400)) log_message <> ", keeping non public posts"
else
log_message
end
from(o in Object, log_message =
where: if Keyword.get(options, :keep_threads) do
fragment( log_message <> ", keeping threads intact"
"?->'to' \\? ? OR ?->'cc' \\? ?", else
o.data, log_message
^Pleroma.Constants.as_public(), end
o.data,
^Pleroma.Constants.as_public() Logger.info(log_message)
),
where: o.inserted_at < ^time_deadline, if Keyword.get(options, :keep_threads) do
where: # We want to delete objects from threads where
# 1. the newest post is still old
# 2. none of the activities is local
# 3. none of the activities is bookmarked
# 4. optionally none of the posts is non-public
deletable_context =
if Keyword.get(options, :keep_non_public) do
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
|> having(
[a],
not fragment(
# Posts (checked on Create Activity) is non-public
"bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')",
a.data,
^Pleroma.Constants.as_public(),
a.data,
^Pleroma.Constants.as_public(),
a.data
)
)
else
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
end
|> having([a], max(a.updated_at) < ^time_deadline)
|> having([a], not fragment("bool_or(?)", a.local))
|> having([_, b], fragment("max(?::text) is null", b.id))
|> select([a], fragment("? ->> 'context'::text", a.data))
Pleroma.Object
|> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context))
else
if Keyword.get(options, :keep_non_public) do
Pleroma.Object
|> where(
[o],
fragment(
"?->'to' \\? ? OR ?->'cc' \\? ?",
o.data,
^Pleroma.Constants.as_public(),
o.data,
^Pleroma.Constants.as_public()
)
)
else
Pleroma.Object
end
|> where([o], o.updated_at < ^time_deadline)
|> where(
[o],
fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
) )
end
|> Repo.delete_all(timeout: :infinity) |> Repo.delete_all(timeout: :infinity)
prune_hashtags_query = """ prune_hashtags_query = """

View file

@ -46,7 +46,6 @@ test "it replaces objects with references" do
describe "prune_objects" do describe "prune_objects" do
test "it prunes old objects from the database" do test "it prunes old objects from the database" do
insert(:note)
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1 deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
date = date =
@ -55,18 +54,304 @@ test "it prunes old objects from the database" do
|> Timex.to_naive_datetime() |> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second) |> NaiveDateTime.truncate(:second)
%{id: id} = insert(:note)
%{id: note_remote_public_id} =
:note :note
|> insert() |> insert()
|> Ecto.Changeset.change(%{inserted_at: date}) |> Ecto.Changeset.change(%{updated_at: date})
|> Repo.update!() |> Repo.update!()
assert length(Repo.all(Object)) == 2 note_remote_non_public =
%{id: note_remote_non_public_id, data: note_remote_non_public_data} =
:note
|> insert()
note_remote_non_public
|> Ecto.Changeset.change(%{
updated_at: date,
data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end)
})
|> Repo.update!()
assert length(Repo.all(Object)) == 3
Mix.Tasks.Pleroma.Database.run(["prune_objects"]) Mix.Tasks.Pleroma.Database.run(["prune_objects"])
assert length(Repo.all(Object)) == 1 assert length(Repo.all(Object)) == 1
refute Object.get_by_id(id) refute Object.get_by_id(note_remote_public_id)
refute Object.get_by_id(note_remote_non_public_id)
end
test "with the --keep-non-public option it still keeps non-public posts even if they are not local" do
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
date =
Timex.now()
|> Timex.shift(days: -deadline)
|> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second)
insert(:note)
%{id: note_remote_id} =
:note
|> insert()
|> Ecto.Changeset.change(%{updated_at: date})
|> Repo.update!()
note_remote_non_public =
%{data: note_remote_non_public_data} =
:note
|> insert()
note_remote_non_public
|> Ecto.Changeset.change(%{
updated_at: date,
data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end)
})
|> Repo.update!()
assert length(Repo.all(Object)) == 3
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-non-public"])
assert length(Repo.all(Object)) == 2
refute Object.get_by_id(note_remote_id)
end
test "with the --keep-threads and --keep-non-public option it keeps old threads with non-public replies even if the interaction is not local" do
# For non-public we only check Create Activities because only these are relevant for threads
# Flags are always non-public, Announces from relays can be non-public...
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
old_insert_date =
Timex.now()
|> Timex.shift(days: -deadline)
|> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second)
remote_user1 = insert(:user, local: false)
remote_user2 = insert(:user, local: false)
# Old remote non-public reply (should be kept)
{:ok, old_remote_post1_activity} =
CommonAPI.post(remote_user1, %{status: "some thing", local: false})
old_remote_post1_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_remote_non_public_reply_activity} =
CommonAPI.post(remote_user2, %{
status: "some reply",
in_reply_to_status_id: old_remote_post1_activity.id
})
old_remote_non_public_reply_activity
|> Ecto.Changeset.change(%{
local: false,
updated_at: old_insert_date,
data: old_remote_non_public_reply_activity.data |> update_in(["to"], fn _ -> [] end)
})
|> Repo.update!()
# Old remote non-public Announce (should be removed)
{:ok, old_remote_post2_activity = %{data: %{"object" => old_remote_post2_id}}} =
CommonAPI.post(remote_user1, %{status: "some thing", local: false})
old_remote_post2_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_remote_non_public_repeat_activity} =
CommonAPI.repeat(old_remote_post2_activity.id, remote_user2)
old_remote_non_public_repeat_activity
|> Ecto.Changeset.change(%{
local: false,
updated_at: old_insert_date,
data: old_remote_non_public_repeat_activity.data |> update_in(["to"], fn _ -> [] end)
})
|> Repo.update!()
assert length(Repo.all(Object)) == 3
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads", "--keep-non-public"])
Repo.all(Pleroma.Activity)
assert length(Repo.all(Object)) == 2
refute Object.get_by_ap_id(old_remote_post2_id)
end
test "with the --keep-threads option it still keeps non-old threads even with no local interactions" do
remote_user = insert(:user, local: false)
remote_user2 = insert(:user, local: false)
{:ok, remote_post_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
{:ok, remote_post_reply_activity} =
CommonAPI.post(remote_user2, %{
status: "some reply",
in_reply_to_status_id: remote_post_activity.id
})
remote_post_activity
|> Ecto.Changeset.change(%{local: false})
|> Repo.update!()
remote_post_reply_activity
|> Ecto.Changeset.change(%{local: false})
|> Repo.update!()
assert length(Repo.all(Object)) == 2
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"])
assert length(Repo.all(Object)) == 2
end
test "with the --keep-threads option it deletes old threads with no local interaction" do
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
old_insert_date =
Timex.now()
|> Timex.shift(days: -deadline)
|> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second)
remote_user = insert(:user, local: false)
remote_user2 = insert(:user, local: false)
{:ok, old_remote_post_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
old_remote_post_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_remote_post_reply_activity} =
CommonAPI.post(remote_user2, %{
status: "some reply",
in_reply_to_status_id: old_remote_post_activity.id
})
old_remote_post_reply_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_favourite_activity} =
CommonAPI.favorite(remote_user2, old_remote_post_activity.id)
old_favourite_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_repeat_activity} = CommonAPI.repeat(old_remote_post_activity.id, remote_user2)
old_repeat_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
assert length(Repo.all(Object)) == 2
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"])
assert length(Repo.all(Object)) == 0
end
test "with the --keep-threads option it keeps old threads with local interaction" do
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
old_insert_date =
Timex.now()
|> Timex.shift(days: -deadline)
|> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second)
remote_user = insert(:user, local: false)
local_user = insert(:user, local: true)
# local reply
{:ok, old_remote_post1_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
old_remote_post1_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_local_post2_reply_activity} =
CommonAPI.post(local_user, %{
status: "some reply",
in_reply_to_status_id: old_remote_post1_activity.id
})
old_local_post2_reply_activity
|> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date})
|> Repo.update!()
# local Like
{:ok, old_remote_post3_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
old_remote_post3_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_favourite_activity} = CommonAPI.favorite(local_user, old_remote_post3_activity.id)
old_favourite_activity
|> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date})
|> Repo.update!()
# local Announce
{:ok, old_remote_post4_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
old_remote_post4_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
{:ok, old_repeat_activity} = CommonAPI.repeat(old_remote_post4_activity.id, local_user)
old_repeat_activity
|> Ecto.Changeset.change(%{local: true, updated_at: old_insert_date})
|> Repo.update!()
assert length(Repo.all(Object)) == 4
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"])
assert length(Repo.all(Object)) == 4
end
test "with the --keep-threads option it keeps old threads with bookmarked posts" do
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) + 1
old_insert_date =
Timex.now()
|> Timex.shift(days: -deadline)
|> Timex.to_naive_datetime()
|> NaiveDateTime.truncate(:second)
remote_user = insert(:user, local: false)
local_user = insert(:user, local: true)
{:ok, old_remote_post_activity} =
CommonAPI.post(remote_user, %{status: "some thing", local: false})
old_remote_post_activity
|> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date})
|> Repo.update!()
Pleroma.Bookmark.create(local_user.id, old_remote_post_activity.id)
assert length(Repo.all(Object)) == 1
Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-threads"])
assert length(Repo.all(Object)) == 1
end end
end end