Add backfilling of posts #846

Open
tudbut wants to merge 21 commits from tudbut/akkoma:develop into develop
7 changed files with 70 additions and 11 deletions
Showing only changes of commit 134d395a74 - Show all commits

View file

@ -19,7 +19,7 @@ def cast(object) when is_binary(object) do
def cast(%{"id" => object}), do: cast(object) def cast(%{"id" => object}), do: cast(object)
def cast(_), do: :error def cast(o), do: {:error, o}
tudbut marked this conversation as resolved Outdated

is this signature change actually safe? Why is it done?
Note this is also inconsistent with the return of a prior function clause

is this signature change actually safe? Why is it done? Note this is also inconsistent with the return of a prior function clause

this can be reverted. i experienced several errors that were not specific enough and doing this told me what the issue was. ill revert it if you want

this can be reverted. i experienced several errors that were not specific enough and doing this told me what the issue was. ill revert it if you want

reverting this in the next commit

reverting this in the next commit
def dump(data), do: {:ok, data} def dump(data), do: {:ok, data}

View file

@ -152,6 +152,7 @@ defmodule Pleroma.User do
field(:also_known_as, {:array, ObjectValidators.ObjectID}, default: []) field(:also_known_as, {:array, ObjectValidators.ObjectID}, default: [])
field(:inbox, :string) field(:inbox, :string)
field(:shared_inbox, :string) field(:shared_inbox, :string)
field(:outbox, :string, default: nil)
field(:last_active_at, :naive_datetime) field(:last_active_at, :naive_datetime)
field(:disclose_client, :boolean, default: true) field(:disclose_client, :boolean, default: true)
field(:pinned_objects, :map, default: %{}) field(:pinned_objects, :map, default: %{})
@ -469,6 +470,7 @@ def remote_user_changeset(struct \\ %User{local: false}, params) do
:ap_id, :ap_id,
:inbox, :inbox,
:shared_inbox, :shared_inbox,
:outbox,
:nickname, :nickname,
:avatar, :avatar,
:ap_enabled, :ap_enabled,

View file

@ -1587,7 +1587,8 @@ defp object_to_user_data(data, additional) do
actor_type = data["type"] || "Person" actor_type = data["type"] || "Person"
featured_address = data["featured"] featured_address = data["featured"]
{:ok, pinned_objects} = fetch_and_prepare_featured_from_ap_id(featured_address) {:ok, pinned_objects} = fetch_and_prepare_outbox_from_ap_id(featured_address)
outbox_address = data["outbox"]
# first, check that the owner is correct # first, check that the owner is correct
signing_key = signing_key =
@ -1644,6 +1645,7 @@ defp object_to_user_data(data, additional) do
also_known_as: also_known_as, also_known_as: also_known_as,
signing_key: signing_key, signing_key: signing_key,
inbox: data["inbox"], inbox: data["inbox"],
outbox: outbox_address,
shared_inbox: shared_inbox, shared_inbox: shared_inbox,
pinned_objects: pinned_objects, pinned_objects: pinned_objects,
nickname: nickname nickname: nickname
@ -1790,7 +1792,7 @@ def maybe_handle_clashing_nickname(data) do
end end
end end
def pin_data_from_featured_collection(%{ def activity_data_from_outbox_collection(%{
"type" => "OrderedCollection", "type" => "OrderedCollection",
"first" => first "first" => first
}) do }) do
@ -1805,7 +1807,7 @@ def pin_data_from_featured_collection(%{
end end
end end
def pin_data_from_featured_collection( def activity_data_from_outbox_collection(
%{ %{
"type" => type "type" => type
} = collection } = collection
@ -1821,21 +1823,23 @@ def pin_data_from_featured_collection(
end) end)
end end
def pin_data_from_featured_collection(obj) do def activity_data_from_outbox_collection(obj) do
Logger.error("Could not parse featured collection #{inspect(obj)}") Logger.error("Could not parse outbox collection #{inspect(obj)}")
%{} %{}
end end
def fetch_and_prepare_featured_from_ap_id(nil) do def fetch_and_prepare_outbox_from_ap_id(nil) do
{:ok, %{}} {:ok, %{}}
end end
def fetch_and_prepare_featured_from_ap_id(ap_id) do def fetch_and_prepare_outbox_from_ap_id(ap_id) do
Logger.info("Fetching outbox #{ap_id}")
with {:ok, data} <- Fetcher.fetch_and_contain_remote_object_from_id(ap_id) do with {:ok, data} <- Fetcher.fetch_and_contain_remote_object_from_id(ap_id) do
{:ok, pin_data_from_featured_collection(data)} {:ok, activity_data_from_outbox_collection(data)}
else else
e -> e ->
Logger.error("Could not decode featured collection at fetch #{ap_id}, #{inspect(e)}") Logger.error("Could not decode outbox collection at fetch #{ap_id}, #{inspect(e)}")
{:ok, %{}} {:ok, %{}}
end end
end end
@ -1854,6 +1858,19 @@ def enqueue_pin_fetches(%{pinned_objects: pins}) do
def enqueue_pin_fetches(_), do: nil def enqueue_pin_fetches(_), do: nil
def enqueue_outbox_fetches(%{outbox: outbox_address}) do
# enqueue a task to fetch the outbox
Logger.debug("Refetching outbox #{outbox_address}")
Pleroma.Workers.RemoteFetcherWorker.enqueue("fetch_outbox", %{
"id" => outbox_address
})
:ok
end
def enqueue_outbox_fetches(_), do: :error
def make_user_from_ap_id(ap_id, additional \\ []) do def make_user_from_ap_id(ap_id, additional \\ []) do
user = User.get_cached_by_ap_id(ap_id) user = User.get_cached_by_ap_id(ap_id)

View file

@ -637,7 +637,10 @@ defp handle_incoming_normalised(
end end
end end
defp handle_incoming_normalised(_, _), do: :error defp handle_incoming_normalised(%{"object" => o}, options),
do: handle_incoming_normalised(o, options)
defp handle_incoming_normalised(o, _), do: {:error, {:unknown_object_class, o}}
tudbut marked this conversation as resolved Outdated

Just processing the referenced object if we don't understand the activity is imo a bad idea and likely to break stuff: e.g. consider a malformed Delete which might now end up actually inserting the object instead of deleting it
It can also end up breaking #845 (which i also haven’t reviewed yet)

similarly the changed return signature from :error to a tuple {:error, {.., ..} needs some attention to check whether anything is affected by this

Just processing the referenced object if we don't understand the activity is imo a bad idea and likely to break stuff: e.g. consider a malformed Delete which might now end up actually inserting the object instead of deleting it It can also end up breaking #845 *(which i also haven’t reviewed yet)* similarly the changed return signature from `:error` to a tuple `{:error, {.., ..}` needs some attention to check whether anything is affected by this

the issue is that when fetching other things (e.g. pins, we get objects directly. akkoma just wraps things in an activity no matter what, so we get an activity that's inside an activity)

the issue is that when fetching other things (e.g. pins, we get objects directly. akkoma just wraps things in an activity no matter what, so we get an activity that's inside an activity)

this arises from https://akkoma.dev/AkkomaGang/akkoma/src/branch/develop/lib/pleroma/object/fetcher.ex#L150 where we wrap everything in a fake activity

we can modify said fake create activity to have some sort of indication that it is fake

this arises from https://akkoma.dev/AkkomaGang/akkoma/src/branch/develop/lib/pleroma/object/fetcher.ex#L150 where we wrap everything in a fake activity we can modify said fake create activity to have some sort of indication that it is fake

adding such an indication in the next commit

adding such an indication in the next commit

I don't like yet another "fake activity" marker (We already have one) with different semantics. Instead, the fetch logic should be changed to only wrap objects which actually need this

Both this and #845 could use a new fetchable_objects constant

I don't like yet another "fake activity" marker *(We already have one)* with different semantics. Instead, the fetch logic should be changed to only wrap objects which actually need this Both this and #845 could use a new `fetchable_objects` constant

We already have one?

Where? I couldn't find it personally, but I'm also pretty new to the codebase

> We already have one? Where? I couldn't find it personally, but I'm also pretty new to the codebase

Multiple actually:

"pleroma:fake_object_id"
"pleroma:fakeid"
"pleroma:fakecontext"

and also a "fake" => true property sometimes.

Furthermore, Create activities for fetched remote posts have a null id since a few years or so (and fixing a fallout from this keeps getting defered due to priority and time, see #712 )
The need to deal with them in al subsequent places makes things messy and imo they ideally shouldn't exist at all.
Unfortunately some of this is hard to fix due to fundamental design choices from early days (e.g. requiring the presence of a Create activity for all viewable objects).

Fortunately for what you’re trying to do here, it’s easy enough to just not create unecessary activities in the first place, avoiding all of this.

Multiple actually: ``` "pleroma:fake_object_id" "pleroma:fakeid" "pleroma:fakecontext" ``` and also a `"fake" => true` property sometimes. Furthermore, `Create` activities for fetched remote posts have a `null` id since a few years or so *(and fixing a fallout from this keeps getting defered due to priority and time, see https://akkoma.dev/AkkomaGang/akkoma/issues/712 )* The need to deal with them in al subsequent places makes things messy and imo they ideally shouldn't exist at all. Unfortunately some of this is hard to fix due to fundamental design choices from early days *(e.g. requiring the presence of a `Create` activity for all viewable objects)*. Fortunately for what you’re trying to do here, it’s easy enough to just not create unecessary activities in the first place, avoiding all of this.

Fortunately for what you’re trying to do here, it’s easy enough to just not create unecessary activities in the first place, avoiding all of this.

Wouldn't I need to completely rewrite the function around https://akkoma.dev/AkkomaGang/akkoma/src/branch/develop/lib/pleroma/object/fetcher.ex#L150 for that? Or at the very least completely bypass the function?

I can definitely switch to using "fake" => true or something. Or I can check if the ID is null

> Fortunately for what you’re trying to do here, it’s easy enough to just not create unecessary activities in the first place, avoiding all of this. Wouldn't I need to completely rewrite the function around https://akkoma.dev/AkkomaGang/akkoma/src/branch/develop/lib/pleroma/object/fetcher.ex#L150 for that? Or at the very least completely bypass the function? I can definitely switch to using "fake" => true or something. Or I can check if the ID is null

Instead, the fetch logic should be changed to only wrap objects which actually need this

I just realized this can actually be done relatively easily and will be implementing it now

> Instead, the fetch logic should be changed to only wrap objects which actually need this I just realized this can actually be done relatively easily and will be implementing it now
@spec get_obj_helper(String.t(), Keyword.t()) :: {:ok, Object.t()} | nil @spec get_obj_helper(String.t(), Keyword.t()) :: {:ok, Object.t()} | nil
def get_obj_helper(id, options \\ []) do def get_obj_helper(id, options \\ []) do

View file

@ -306,6 +306,8 @@ def show(%{assigns: %{user: for_user}} = conn, %{id: nickname_or_id} = params) d
def statuses(%{assigns: %{user: reading_user}} = conn, params) do def statuses(%{assigns: %{user: reading_user}} = conn, params) do
with %User{} = user <- User.get_cached_by_nickname_or_id(params.id, for: reading_user), with %User{} = user <- User.get_cached_by_nickname_or_id(params.id, for: reading_user),
:visible <- User.visible_for(user, reading_user) do :visible <- User.visible_for(user, reading_user) do
ActivityPub.enqueue_outbox_fetches(user)

imho this should be limited to, or there should at least be an option to limit this to, authenticated requests only. Processing new posts entailing pulling in their whole threads, new refeenced users and their pinned (now generally latest) posts and their threads and so on can become quite costly

Also no need to enquee fetched if the viewed profile is a local user

Furthermore, there prob should be a timeout (e.g. 60 min) so we don't constantly spam outbox fetches for popular remote users

imho this should be limited to, or there should at least be an option to limit this to, authenticated requests only. Processing new posts entailing pulling in their whole threads, new refeenced users and their ~~pinned~~ *(now generally latest)* posts and their threads and so on can become quite costly Also no need to enquee fetched if the viewed profile is a local user Furthermore, there prob should be a timeout *(e.g. 60 min)* so we don't constantly spam outbox fetches for popular remote users

we do always fetch the outbox, but activities in it are still cached. a timeout may make sense either way.

local users never get an outbox address property, its only added to the user object when its a remote one. so technically we're already not fetching local outboxes. making that more explicit makes sense tho

we do always fetch the outbox, but activities in it are still cached. a timeout may make sense either way. local users never get an outbox address property, its only added to the user object when its a remote one. so technically we're already not fetching local outboxes. making that more explicit makes sense tho

we do always fetch the outbox, but activities in it are still cached.

This is insufficent, since:

  • it still spams remote servers with many requests (due to default uniqueness job constraint: 1 request per second and per remote user); this is not nice on remote servers and might aalso end up creating federation problems for us if we get timeouted due to it
  • it creates unecessary Oban job churn (yes this can matter for perf and also increases PostgreSQL bloat from deleted and not-yet-vacuumed rows; more on that in a future PR once i get to finish what i have on disk)

And also doesn’t address the limitation to authenticated users only. As I mentioned this can be costly and I don’t want unauthenticated users browsing user profiles to ever be able to create such a significant workload for the instance for profiles actual local users might never even look at. (Letting unauthenticated viewers trigger this also exacerbates the request spam issue)

> we do always fetch the outbox, but activities in it are still cached. This is insufficent, since: - it still spams remote servers with many requests *(due to default uniqueness job constraint: 1 request per second and per remote user)*; this is not nice on remote servers and might aalso end up creating federation problems for us if we get timeouted due to it - it creates unecessary Oban job churn *(yes this can matter for perf and also increases PostgreSQL bloat from deleted and not-yet-vacuumed rows; more on that in a future PR once i get to finish what i have on disk)* And also doesn’t address the limitation to authenticated users only. As I mentioned this can be costly and I don’t want unauthenticated users browsing user profiles to ever be able to create such a significant workload for the instance for profiles actual local users might never even look at. *(Letting unauthenticated viewers trigger this also exacerbates the request spam issue)*

I've made it only fetch once per minute per user, and ignore local users. If one minute is too small, I propose making this a setting, though I'm not familiar with how to do that at this point. 10 mins should also be fine to hard-code, but an hour feels a little much.

I've made it only fetch once per minute per user, and ignore local users. If one minute is too small, I propose making this a setting, though I'm not familiar with how to do that at this point. 10 mins should also be fine to hard-code, but an hour feels a little much.

The timeout mechanism seems like it should work and uses mechansims you’re familiar with
Though this much additional DB traffic to frequently update timestamp might (untested) negatively affect overall perf. Given the cooldown period is well within typicall continuous-uptime, I was thinking of just using a runtime-only hint in caches; see e.g. #762 for an example of a (new) cachex set being used for timeouts

To add the restriction for logged-in users, see e.g. lib/pleroma/web/mastodon_api/controllers/timeline_controller.ex’s public/2 function

If this is too much, we can also adjust those bits later before merge (though it might take a while)

The timeout mechanism seems like it should work and uses mechansims you’re familiar with Though this much additional DB traffic to frequently update timestamp might *(untested)* negatively affect overall perf. Given the cooldown period is well within typicall continuous-uptime, I was thinking of just using a runtime-only hint in caches; see e.g. #762 for an example of a (new) `cachex` set being used for timeouts To add the restriction for logged-in users, see e.g. `lib/pleroma/web/mastodon_api/controllers/timeline_controller.ex`’s `public/2` function If this is too much, we can also adjust those bits later before merge *(though it might take a while)*

I have now added the cachex bit. Ill be pushing this once i have verified it still works, and will next be looking at the logged in users restriction.

I have now added the cachex bit. Ill be pushing this once i have verified it still works, and will next be looking at the logged in users restriction.

Okay! Hope this is good now (no more when not logged in, cachex for timeouts). If so, I think that would be all in terms of what you requested, but let me know if I've missed anything!

Okay! Hope this is good now (no more when not logged in, cachex for timeouts). If so, I think that would be all in terms of what you requested, but let me know if I've missed anything!
params = params =
params params
|> Map.delete(:tagged) |> Map.delete(:tagged)

View file

@ -4,12 +4,32 @@
defmodule Pleroma.Workers.RemoteFetcherWorker do defmodule Pleroma.Workers.RemoteFetcherWorker do
alias Pleroma.Object.Fetcher alias Pleroma.Object.Fetcher
alias Pleroma.Object
alias Pleroma.Web.ActivityPub.ActivityPub
use Pleroma.Workers.WorkerHelper, use Pleroma.Workers.WorkerHelper,
queue: "remote_fetcher", queue: "remote_fetcher",
unique: [period: 300, states: Oban.Job.states(), keys: [:op, :id]] unique: [period: 300, states: Oban.Job.states(), keys: [:op, :id]]
@impl Oban.Worker @impl Oban.Worker
def perform(%Job{args: %{"op" => "fetch_outbox", "id" => address}}) do
with {:ok, outbox} <- ActivityPub.fetch_and_prepare_outbox_from_ap_id(address) do
Enum.each(Enum.reverse(outbox), fn {ap_id, _} ->
if is_nil(Object.get_cached_by_ap_id(ap_id)) do
enqueue("fetch_remote", %{
"id" => ap_id,
"depth" => 1
})
end
end)
:ok
else
e -> {:error, e}
end
end
def perform(%Job{args: %{"op" => "fetch_remote", "id" => id} = args}) do def perform(%Job{args: %{"op" => "fetch_remote", "id" => id} = args}) do
case Fetcher.fetch_object_from_id(id, depth: args["depth"]) do case Fetcher.fetch_object_from_id(id, depth: args["depth"]) do
{:ok, _object} -> {:ok, _object} ->

View file

@ -0,0 +1,15 @@
defmodule Pleroma.Repo.Migrations.UsersAddOutboxes do
use Ecto.Migration
def up do
alter table(:users) do
add_if_not_exists(:outbox, :text)
end
end
tudbut marked this conversation as resolved Outdated

This does not generally hold

This does not generally hold

im aware. but for migrating instances, no outboxes will be populated at first so this is kinda the only solution if you want previously fetched users to have the property

im aware. but for migrating instances, no outboxes will be populated at first so this is kinda the only solution if you want previously fetched users to have the property

any idea how to do this more accurately?

any idea how to do this more accurately?

don’t populate it and wait for the next user refresh to occur or Update activity to arrive filling outbox with the correct value

don’t populate it and wait for the next user refresh to occur or Update activity to arrive filling outbox with the correct value

(also this breaks for e.g. users named "inbox")

(also this breaks for e.g. users named "inbox")
def down do
alter table(:users) do
remove_if_exists(:outbox, :text)
end
end
end