RichMedia refactor
Rich Media parsing was previously handled on-demand with a 2 second HTTP request timeout and retained only in Cachex. Every time a Pleroma instance is restarted it will have to request and parse the data for each status with a URL detected. When fetching a batch of statuses they were processed in parallel to attempt to keep the maximum latency at 2 seconds, but often resulted in a timeline appearing to hang during loading due to a URL that could not be successfully reached. URLs which had images links that expire (Amazon AWS) were parsed and inserted with a TTL to ensure the image link would not break. Rich Media data is now cached in the database and fetched asynchronously. Cachex is used as a read-through cache. When the data becomes available we stream an update to the clients. If the result is returned quickly the experience is almost seamless. Activities were already processed for their Rich Media data during ingestion to warm the cache, so users should not normally encounter the asynchronous loading of the Rich Media data. Implementation notes: - The async worker is a Task with a globally unique process name to prevent duplicate processing of the same URL - The Task will attempt to fetch the data 3 times with increasing sleep time between attempts - The HTTP request obeys the default HTTP request timeout value instead of 2 seconds - URLs that cannot be successfully parsed due to an unexpected error receives a negative cache entry for 15 minutes - URLs that fail with an expected error will receive a negative cache with no TTL - Activities that have no detected URLs insert a nil value in the Cachex :scrubber_cache so we do not repeat parsing the object content with Floki every time the activity is rendered - Expiring image URLs are handled with an Oban job - There is no automatic cleanup of the Rich Media data in the database, but it is safe to delete at any time - The post draft/preview feature makes the URL processing synchronous so the rendered post preview will have an accurate rendering Overall performance of timelines and creating new posts which contain URLs is greatly improved.
This commit is contained in:
parent
a924e117fd
commit
5da9cbd8a5
16 changed files with 431 additions and 240 deletions
1
changelog.d/rich_media_refactor.change
Normal file
1
changelog.d/rich_media_refactor.change
Normal file
|
@ -0,0 +1 @@
|
|||
Refactored Rich Media to cache the content in the database. Fetching operations that could block status rendering have been eliminated.
|
|
@ -579,7 +579,8 @@
|
|||
mute_expire: 5,
|
||||
search_indexing: 10,
|
||||
nodeinfo_fetcher: 1,
|
||||
database_prune: 1
|
||||
database_prune: 1,
|
||||
rich_media_expiration: 2
|
||||
],
|
||||
plugins: [
|
||||
Oban.Plugins.Pruner,
|
||||
|
|
|
@ -141,6 +141,8 @@
|
|||
config :pleroma, :instances_favicons, enabled: false
|
||||
config :pleroma, :instances_nodeinfo, enabled: false
|
||||
|
||||
config :pleroma, Pleroma.Web.RichMedia.Backfill, provider: Pleroma.Web.RichMedia.Backfill
|
||||
|
||||
if File.exists?("./config/test.secret.exs") do
|
||||
import_config "test.secret.exs"
|
||||
else
|
||||
|
|
|
@ -67,22 +67,9 @@ def ensure_scrubbed_html(
|
|||
end
|
||||
end
|
||||
|
||||
def extract_first_external_url_from_object(%{data: %{"content" => content}} = object)
|
||||
@spec extract_first_external_url_from_object(Pleroma.Object.t()) :: String.t() | nil
|
||||
def extract_first_external_url_from_object(%{data: %{"content" => content}})
|
||||
when is_binary(content) do
|
||||
unless object.data["fake"] do
|
||||
key = "URL|#{object.id}"
|
||||
|
||||
@cachex.fetch!(:scrubber_cache, key, fn _key ->
|
||||
{:commit, {:ok, extract_first_external_url(content)}}
|
||||
end)
|
||||
else
|
||||
{:ok, extract_first_external_url(content)}
|
||||
end
|
||||
end
|
||||
|
||||
def extract_first_external_url_from_object(_), do: {:error, :no_content}
|
||||
|
||||
def extract_first_external_url(content) do
|
||||
content
|
||||
|> Floki.parse_fragment!()
|
||||
|> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
|
||||
|
@ -90,4 +77,6 @@ def extract_first_external_url(content) do
|
|||
|> Floki.attribute("href")
|
||||
|> Enum.at(0)
|
||||
end
|
||||
|
||||
def extract_first_external_url_from_object(_), do: nil
|
||||
end
|
||||
|
|
|
@ -155,9 +155,7 @@ def insert(map, local \\ true, fake \\ false, bypass_actor_check \\ false) when
|
|||
# Splice in the child object if we have one.
|
||||
activity = Maps.put_if_present(activity, :object, object)
|
||||
|
||||
ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn ->
|
||||
Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
|
||||
end)
|
||||
Pleroma.Web.RichMedia.Card.get_by_activity(activity)
|
||||
|
||||
# Add local posts to search index
|
||||
if local, do: Pleroma.Search.add_to_index(activity)
|
||||
|
@ -185,7 +183,7 @@ def insert(map, local \\ true, fake \\ false, bypass_actor_check \\ false) when
|
|||
id: "pleroma:fakeid"
|
||||
}
|
||||
|
||||
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
|
||||
Pleroma.Web.RichMedia.Card.get_by_activity(activity)
|
||||
{:ok, activity}
|
||||
|
||||
{:remote_limit_pass, _} ->
|
||||
|
|
|
@ -225,9 +225,7 @@ def handle(%{data: %{"type" => "Create"}} = activity, meta) do
|
|||
end
|
||||
end
|
||||
|
||||
ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn ->
|
||||
Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
|
||||
end)
|
||||
Pleroma.Web.RichMedia.Card.get_by_activity(activity)
|
||||
|
||||
Pleroma.Search.add_to_index(Map.put(activity, :object, object))
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do
|
|||
alias Pleroma.Web.OAuth.Token
|
||||
alias Pleroma.Web.Plugs.OAuthScopesPlug
|
||||
alias Pleroma.Web.Plugs.RateLimiter
|
||||
alias Pleroma.Web.RichMedia.Card
|
||||
|
||||
plug(Pleroma.Web.ApiSpec.CastAndValidate)
|
||||
|
||||
|
@ -383,6 +384,21 @@ def unmute_conversation(%{assigns: %{user: user}} = conn, %{id: id}) do
|
|||
end
|
||||
end
|
||||
|
||||
@doc "GET /api/v1/statuses/:id/card"
|
||||
@deprecated "https://github.com/tootsuite/mastodon/pull/11213"
|
||||
def card(
|
||||
%{assigns: %{user: user}, private: %{open_api_spex: %{params: %{id: status_id}}}} = conn,
|
||||
_
|
||||
) do
|
||||
with %Activity{} = activity <- Activity.get_by_id(status_id),
|
||||
true <- Visibility.visible_for_user?(activity, user),
|
||||
%Card{} = card_data <- Card.get_by_activity(activity) do
|
||||
render(conn, "card.json", card_data)
|
||||
else
|
||||
_ -> render_error(conn, :not_found, "Record not found")
|
||||
end
|
||||
end
|
||||
|
||||
@doc "GET /api/v1/statuses/:id/favourited_by"
|
||||
def favourited_by(%{assigns: %{user: user}} = conn, %{id: id}) do
|
||||
with true <- Pleroma.Config.get([:instance, :show_reactions]),
|
||||
|
|
|
@ -22,6 +22,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
|
|||
alias Pleroma.Web.MediaProxy
|
||||
alias Pleroma.Web.PleromaAPI.EmojiReactionController
|
||||
require Logger
|
||||
alias Pleroma.Web.RichMedia.Card
|
||||
|
||||
import Pleroma.Web.ActivityPub.Visibility, only: [get_visibility: 1, visible_for_user?: 2]
|
||||
|
||||
|
@ -30,9 +31,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
|
|||
# pagination is restricted to 40 activities at a time
|
||||
defp fetch_rich_media_for_activities(activities) do
|
||||
Enum.each(activities, fn activity ->
|
||||
spawn(fn ->
|
||||
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
|
||||
end)
|
||||
spawn(fn -> Card.get_by_activity(activity) end)
|
||||
end)
|
||||
end
|
||||
|
||||
|
@ -93,9 +92,7 @@ def render("index.json", opts) do
|
|||
# To do: check AdminAPIControllerTest on the reasons behind nil activities in the list
|
||||
activities = Enum.filter(opts.activities, & &1)
|
||||
|
||||
# Start fetching rich media before doing anything else, so that later calls to get the cards
|
||||
# only block for timeout in the worst case, as opposed to
|
||||
# length(activities_with_links) * timeout
|
||||
# Start prefetching rich media before doing anything else
|
||||
fetch_rich_media_for_activities(activities)
|
||||
replied_to_activities = get_replied_to_activities(activities)
|
||||
|
||||
|
@ -301,12 +298,82 @@ def render("show.json", %{activity: %{id: id, data: %{"object" => _object}} = ac
|
|||
object
|
||||
|> render_content()
|
||||
|
||||
content_html =
|
||||
content
|
||||
|> Activity.HTML.get_cached_scrubbed_html_for_activity(
|
||||
User.html_filter_policy(opts[:for]),
|
||||
activity,
|
||||
"mastoapi:content:#{chrono_order}"
|
||||
quote_post =
|
||||
if visible_for_user?(quote_activity, opts[:for]) and opts[:show_quote] != false do
|
||||
quote_rendering_opts = Map.merge(opts, %{activity: quote_activity, show_quote: false})
|
||||
render("show.json", quote_rendering_opts)
|
||||
else
|
||||
nil
|
||||
end
|
||||
|
||||
content =
|
||||
object
|
||||
|> render_content()
|
||||
|
||||
content_html =
|
||||
content
|
||||
|> Activity.HTML.get_cached_scrubbed_html_for_activity(
|
||||
User.html_filter_policy(opts[:for]),
|
||||
activity,
|
||||
"mastoapi:content:#{chrono_order}"
|
||||
)
|
||||
|
||||
content_plaintext =
|
||||
content
|
||||
|> Activity.HTML.get_cached_stripped_html_for_activity(
|
||||
activity,
|
||||
"mastoapi:content:#{chrono_order}"
|
||||
)
|
||||
|
||||
summary = object.data["summary"] || ""
|
||||
|
||||
card =
|
||||
case Card.get_by_activity(activity) do
|
||||
%Card{} = result -> render("card.json", result)
|
||||
_ -> nil
|
||||
end
|
||||
|
||||
url =
|
||||
if user.local do
|
||||
Pleroma.Web.Router.Helpers.o_status_url(Pleroma.Web.Endpoint, :notice, activity)
|
||||
else
|
||||
object.data["url"] || object.data["external_url"] || object.data["id"]
|
||||
end
|
||||
|
||||
direct_conversation_id =
|
||||
with {_, nil} <- {:direct_conversation_id, opts[:direct_conversation_id]},
|
||||
{_, true} <- {:include_id, opts[:with_direct_conversation_id]},
|
||||
{_, %User{} = for_user} <- {:for_user, opts[:for]} do
|
||||
Activity.direct_conversation_id(activity, for_user)
|
||||
else
|
||||
{:direct_conversation_id, participation_id} when is_integer(participation_id) ->
|
||||
participation_id
|
||||
|
||||
_e ->
|
||||
nil
|
||||
end
|
||||
|
||||
emoji_reactions =
|
||||
object
|
||||
|> Object.get_emoji_reactions()
|
||||
|> EmojiReactionController.filter_allowed_users(
|
||||
opts[:for],
|
||||
Map.get(opts, :with_muted, false)
|
||||
)
|
||||
|> Stream.map(fn {emoji, users, url} ->
|
||||
build_emoji_map(emoji, users, url, opts[:for])
|
||||
end)
|
||||
|> Enum.to_list()
|
||||
|
||||
# Status muted state (would do 1 request per status unless user mutes are preloaded)
|
||||
muted =
|
||||
thread_muted? ||
|
||||
UserRelationship.exists?(
|
||||
get_in(opts, [:relationships, :user_relationships]),
|
||||
:mute,
|
||||
opts[:for],
|
||||
user,
|
||||
fn for_user, user -> User.mutes?(for_user, user) end
|
||||
)
|
||||
|
||||
content_plaintext =
|
||||
|
@ -528,15 +595,8 @@ def render("source.json", %{activity: %{data: %{"object" => _object}} = activity
|
|||
}
|
||||
end
|
||||
|
||||
def render("card.json", %{rich_media: rich_media, page_url: page_url}) do
|
||||
page_url_data = URI.parse(page_url)
|
||||
|
||||
page_url_data =
|
||||
if is_binary(rich_media["url"]) do
|
||||
URI.merge(page_url_data, URI.parse(rich_media["url"]))
|
||||
else
|
||||
page_url_data
|
||||
end
|
||||
def render("card.json", %Card{fields: rich_media}) do
|
||||
page_url_data = URI.parse(rich_media["url"])
|
||||
|
||||
page_url = page_url_data |> to_string
|
||||
|
||||
|
|
101
lib/pleroma/web/rich_media/backfill.ex
Normal file
101
lib/pleroma/web/rich_media/backfill.ex
Normal file
|
@ -0,0 +1,101 @@
|
|||
# Pleroma: A lightweight social networking server
|
||||
# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
|
||||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule Pleroma.Web.RichMedia.Backfill.Task do
|
||||
alias Pleroma.Web.RichMedia.Backfill
|
||||
|
||||
def run(args) do
|
||||
Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args],
|
||||
name: {:global, {:rich_media, args.url_hash}}
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
defmodule Pleroma.Web.RichMedia.Backfill do
|
||||
alias Pleroma.Web.RichMedia.Card
|
||||
alias Pleroma.Web.RichMedia.Parser
|
||||
alias Pleroma.Web.RichMedia.Parser.TTL
|
||||
alias Pleroma.Workers.RichMediaExpirationWorker
|
||||
|
||||
require Logger
|
||||
|
||||
@backfiller Pleroma.Config.get([__MODULE__, :provider], Pleroma.Web.RichMedia.Backfill.Task)
|
||||
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
|
||||
@max_attempts 3
|
||||
@retry 5_000
|
||||
|
||||
def start(%{url: url} = args) when is_binary(url) do
|
||||
url_hash = Card.url_to_hash(url)
|
||||
|
||||
args =
|
||||
args
|
||||
|> Map.put(:attempt, 1)
|
||||
|> Map.put(:url_hash, url_hash)
|
||||
|
||||
@backfiller.run(args)
|
||||
end
|
||||
|
||||
def run(%{url: url, url_hash: url_hash, attempt: attempt} = args)
|
||||
when attempt <= @max_attempts do
|
||||
case Parser.parse(url) do
|
||||
{:ok, fields} ->
|
||||
{:ok, card} = Card.create(url, fields)
|
||||
|
||||
maybe_schedule_expiration(url, fields)
|
||||
|
||||
if Map.has_key?(args, :activity_id) do
|
||||
stream_update(args)
|
||||
end
|
||||
|
||||
warm_cache(url_hash, card)
|
||||
|
||||
{:error, {:invalid_metadata, fields}} ->
|
||||
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
|
||||
negative_cache(url_hash)
|
||||
|
||||
{:error, :body_too_large} ->
|
||||
Logger.error("Rich media error for #{url}: :body_too_large")
|
||||
negative_cache(url_hash)
|
||||
|
||||
{:error, {:content_type, type}} ->
|
||||
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
|
||||
negative_cache(url_hash)
|
||||
|
||||
e ->
|
||||
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
|
||||
|
||||
:timer.sleep(@retry * attempt)
|
||||
|
||||
run(%{args | attempt: attempt + 1})
|
||||
end
|
||||
end
|
||||
|
||||
def run(%{url: url, url_hash: url_hash}) do
|
||||
Logger.debug("Rich media failure for #{url}")
|
||||
|
||||
negative_cache(url_hash, :timer.minutes(15))
|
||||
end
|
||||
|
||||
defp maybe_schedule_expiration(url, fields) do
|
||||
case TTL.get_from_image(fields, url) do
|
||||
ttl when is_number(ttl) ->
|
||||
timestamp = DateTime.from_unix!(ttl)
|
||||
|
||||
RichMediaExpirationWorker.new(%{"url" => url}, scheduled_at: timestamp)
|
||||
|> Oban.insert()
|
||||
|
||||
_ ->
|
||||
:ok
|
||||
end
|
||||
end
|
||||
|
||||
defp stream_update(%{activity_id: activity_id}) do
|
||||
Pleroma.Activity.get_by_id(activity_id)
|
||||
|> Pleroma.Activity.normalize()
|
||||
|> Pleroma.Web.ActivityPub.ActivityPub.stream_out()
|
||||
end
|
||||
|
||||
defp warm_cache(key, val), do: @cachex.put(:rich_media_cache, key, val)
|
||||
defp negative_cache(key, ttl \\ nil), do: @cachex.put(:rich_media_cache, key, nil, ttl: ttl)
|
||||
end
|
157
lib/pleroma/web/rich_media/card.ex
Normal file
157
lib/pleroma/web/rich_media/card.ex
Normal file
|
@ -0,0 +1,157 @@
|
|||
defmodule Pleroma.Web.RichMedia.Card do
|
||||
use Ecto.Schema
|
||||
import Ecto.Changeset
|
||||
import Ecto.Query
|
||||
|
||||
alias Pleroma.Activity
|
||||
alias Pleroma.HTML
|
||||
alias Pleroma.Object
|
||||
alias Pleroma.Repo
|
||||
alias Pleroma.Web.RichMedia.Backfill
|
||||
alias Pleroma.Web.RichMedia.Parser
|
||||
|
||||
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
|
||||
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
|
||||
|
||||
@type t :: %__MODULE__{}
|
||||
|
||||
schema "rich_media_card" do
|
||||
field(:url_hash, :binary)
|
||||
field(:fields, :map)
|
||||
|
||||
timestamps()
|
||||
end
|
||||
|
||||
@doc false
|
||||
def changeset(card, attrs) do
|
||||
card
|
||||
|> cast(attrs, [:url_hash, :fields])
|
||||
|> validate_required([:url_hash, :fields])
|
||||
|> unique_constraint(:url_hash)
|
||||
end
|
||||
|
||||
@spec create(String.t(), map()) :: {:ok, t()}
|
||||
def create(url, fields) do
|
||||
url_hash = url_to_hash(url)
|
||||
|
||||
fields = Map.put_new(fields, "url", url)
|
||||
|
||||
%__MODULE__{}
|
||||
|> changeset(%{url_hash: url_hash, fields: fields})
|
||||
|> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash)
|
||||
end
|
||||
|
||||
@spec delete(String.t()) :: {:ok, Ecto.Schema.t()} | {:error, Ecto.Changeset.t()} | :ok
|
||||
def delete(url) do
|
||||
url_hash = url_to_hash(url)
|
||||
@cachex.del(:rich_media_cache, url_hash)
|
||||
|
||||
case get_by_url(url) do
|
||||
%__MODULE{} = card -> Repo.delete(card)
|
||||
nil -> :ok
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_by_url(String.t() | nil) :: t() | nil | :error
|
||||
def get_by_url(url) when is_binary(url) do
|
||||
if @config_impl.get([:rich_media, :enabled]) do
|
||||
url_hash = url_to_hash(url)
|
||||
|
||||
@cachex.fetch!(:rich_media_cache, url_hash, fn _ ->
|
||||
result =
|
||||
__MODULE__
|
||||
|> where(url_hash: ^url_hash)
|
||||
|> Repo.one()
|
||||
|
||||
case result do
|
||||
%__MODULE__{} = card -> {:commit, card}
|
||||
_ -> {:ignore, nil}
|
||||
end
|
||||
end)
|
||||
else
|
||||
:error
|
||||
end
|
||||
end
|
||||
|
||||
def get_by_url(nil), do: nil
|
||||
|
||||
@spec get_or_backfill_by_url(String.t(), map()) :: t() | nil
|
||||
def get_or_backfill_by_url(url, backfill_opts \\ %{}) do
|
||||
case get_by_url(url) do
|
||||
%__MODULE__{} = card ->
|
||||
card
|
||||
|
||||
nil ->
|
||||
backfill_opts = Map.put(backfill_opts, :url, url)
|
||||
|
||||
Backfill.start(backfill_opts)
|
||||
|
||||
nil
|
||||
|
||||
:error ->
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_by_object(Object.t()) :: t() | nil | :error
|
||||
def get_by_object(object) do
|
||||
case HTML.extract_first_external_url_from_object(object) do
|
||||
nil -> nil
|
||||
url -> get_or_backfill_by_url(url)
|
||||
end
|
||||
end
|
||||
|
||||
@spec get_by_activity(Activity.t()) :: t() | nil | :error
|
||||
# Fake/Draft activity
|
||||
def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do
|
||||
with %Object{} = object <- Object.normalize(activity, fetch: false),
|
||||
url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do
|
||||
case get_by_url(url) do
|
||||
# Cache hit
|
||||
%__MODULE__{} = card ->
|
||||
card
|
||||
|
||||
# Cache miss, but fetch for rendering the Draft
|
||||
_ ->
|
||||
with {:ok, fields} <- Parser.parse(url),
|
||||
{:ok, card} <- create(url, fields) do
|
||||
card
|
||||
else
|
||||
_ -> nil
|
||||
end
|
||||
end
|
||||
else
|
||||
_ ->
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
def get_by_activity(activity) do
|
||||
with %Object{} = object <- Object.normalize(activity, fetch: false),
|
||||
{_, nil} <- {:cached, get_cached_url(object, activity.id)} do
|
||||
nil
|
||||
else
|
||||
{:cached, url} ->
|
||||
get_or_backfill_by_url(url, %{activity_id: activity.id})
|
||||
|
||||
_ ->
|
||||
:error
|
||||
end
|
||||
end
|
||||
|
||||
@spec url_to_hash(String.t()) :: String.t()
|
||||
def url_to_hash(url) do
|
||||
:crypto.hash(:sha256, url) |> Base.encode16(case: :lower)
|
||||
end
|
||||
|
||||
defp get_cached_url(object, activity_id) do
|
||||
key = "URL|#{activity_id}"
|
||||
|
||||
@cachex.fetch!(:scrubber_cache, key, fn _ ->
|
||||
url = HTML.extract_first_external_url_from_object(object)
|
||||
Activity.HTML.add_cache_key_for(activity_id, key)
|
||||
|
||||
{:commit, url}
|
||||
end)
|
||||
end
|
||||
end
|
|
@ -3,85 +3,13 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule Pleroma.Web.RichMedia.Helpers do
|
||||
alias Pleroma.Activity
|
||||
alias Pleroma.Config
|
||||
alias Pleroma.HTML
|
||||
alias Pleroma.Object
|
||||
alias Pleroma.Web.RichMedia.Parser
|
||||
|
||||
@options [
|
||||
max_body: 2_000_000,
|
||||
receive_timeout: 2_000
|
||||
]
|
||||
|
||||
@spec validate_page_url(URI.t() | binary()) :: :ok | :error
|
||||
defp validate_page_url(page_url) when is_binary(page_url) do
|
||||
validate_tld = Config.get([Pleroma.Formatter, :validate_tld])
|
||||
|
||||
page_url
|
||||
|> Linkify.Parser.url?(validate_tld: validate_tld)
|
||||
|> parse_uri(page_url)
|
||||
end
|
||||
|
||||
defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
|
||||
when is_binary(authority) do
|
||||
cond do
|
||||
host in Config.get([:rich_media, :ignore_hosts], []) ->
|
||||
:error
|
||||
|
||||
get_tld(host) in Config.get([:rich_media, :ignore_tld], []) ->
|
||||
:error
|
||||
|
||||
true ->
|
||||
:ok
|
||||
end
|
||||
end
|
||||
|
||||
defp validate_page_url(_), do: :error
|
||||
|
||||
defp parse_uri(true, url) do
|
||||
url
|
||||
|> URI.parse()
|
||||
|> validate_page_url
|
||||
end
|
||||
|
||||
defp parse_uri(_, _), do: :error
|
||||
|
||||
defp get_tld(host) do
|
||||
host
|
||||
|> String.split(".")
|
||||
|> Enum.reverse()
|
||||
|> hd
|
||||
end
|
||||
|
||||
def fetch_data_for_object(object) do
|
||||
with true <- Config.get([:rich_media, :enabled]),
|
||||
{:ok, page_url} <-
|
||||
HTML.extract_first_external_url_from_object(object),
|
||||
:ok <- validate_page_url(page_url),
|
||||
{:ok, rich_media} <- Parser.parse(page_url) do
|
||||
%{page_url: page_url, rich_media: rich_media}
|
||||
else
|
||||
_ -> %{}
|
||||
end
|
||||
end
|
||||
|
||||
def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
|
||||
with true <- Config.get([:rich_media, :enabled]),
|
||||
%Object{} = object <- Object.normalize(activity, fetch: false) do
|
||||
fetch_data_for_object(object)
|
||||
else
|
||||
_ -> %{}
|
||||
end
|
||||
end
|
||||
|
||||
def fetch_data_for_activity(_), do: %{}
|
||||
|
||||
def rich_media_get(url) do
|
||||
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
|
||||
|
||||
head_check =
|
||||
case Pleroma.HTTP.head(url, headers, @options) do
|
||||
case Pleroma.HTTP.head(url, headers, http_options()) do
|
||||
# If the HEAD request didn't reach the server for whatever reason,
|
||||
# we assume the GET that comes right after won't either
|
||||
{:error, _} = e ->
|
||||
|
@ -96,7 +24,7 @@ def rich_media_get(url) do
|
|||
:ok
|
||||
end
|
||||
|
||||
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options)
|
||||
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
|
||||
end
|
||||
|
||||
defp check_content_type(headers) do
|
||||
|
@ -112,12 +40,13 @@ defp check_content_type(headers) do
|
|||
end
|
||||
end
|
||||
|
||||
@max_body @options[:max_body]
|
||||
defp check_content_length(headers) do
|
||||
max_body = Keyword.get(http_options(), :max_body)
|
||||
|
||||
case List.keyfind(headers, "content-length", 0) do
|
||||
{_, maybe_content_length} ->
|
||||
case Integer.parse(maybe_content_length) do
|
||||
{content_length, ""} when content_length <= @max_body -> :ok
|
||||
{content_length, ""} when content_length <= max_body -> :ok
|
||||
{_, ""} -> {:error, :body_too_large}
|
||||
_ -> :ok
|
||||
end
|
||||
|
@ -126,4 +55,11 @@ defp check_content_length(headers) do
|
|||
:ok
|
||||
end
|
||||
end
|
||||
|
||||
defp http_options() do
|
||||
[
|
||||
pool: :media,
|
||||
max_body: Config.get([:rich_media, :max_body], 2_000_000)
|
||||
]
|
||||
end
|
||||
end
|
||||
|
|
|
@ -5,137 +5,28 @@
|
|||
defmodule Pleroma.Web.RichMedia.Parser do
|
||||
require Logger
|
||||
|
||||
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
|
||||
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
|
||||
|
||||
defp parsers do
|
||||
Pleroma.Config.get([:rich_media, :parsers])
|
||||
end
|
||||
|
||||
def parse(nil), do: {:error, "No URL provided"}
|
||||
def parse(nil), do: nil
|
||||
|
||||
if Pleroma.Config.get(:env) == :test do
|
||||
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
||||
def parse(url), do: parse_with_timeout(url)
|
||||
else
|
||||
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
||||
def parse(url) do
|
||||
with {:ok, data} <- get_cached_or_parse(url),
|
||||
{:ok, _} <- set_ttl_based_on_image(data, url) do
|
||||
{:ok, data}
|
||||
end
|
||||
end
|
||||
|
||||
defp get_cached_or_parse(url) do
|
||||
case @cachex.fetch(:rich_media_cache, url, fn ->
|
||||
case parse_with_timeout(url) do
|
||||
{:ok, _} = res ->
|
||||
{:commit, res}
|
||||
|
||||
{:error, reason} = e ->
|
||||
# Unfortunately we have to log errors here, instead of doing that
|
||||
# along with ttl setting at the bottom. Otherwise we can get log spam
|
||||
# if more than one process was waiting for the rich media card
|
||||
# while it was generated. Ideally we would set ttl here as well,
|
||||
# so we don't override it number_of_waiters_on_generation
|
||||
# times, but one, obviously, can't set ttl for not-yet-created entry
|
||||
# and Cachex doesn't support returning ttl from the fetch callback.
|
||||
log_error(url, reason)
|
||||
{:commit, e}
|
||||
end
|
||||
end) do
|
||||
{action, res} when action in [:commit, :ok] ->
|
||||
case res do
|
||||
{:ok, _data} = res ->
|
||||
res
|
||||
|
||||
{:error, reason} = e ->
|
||||
if action == :commit, do: set_error_ttl(url, reason)
|
||||
e
|
||||
end
|
||||
|
||||
{:error, e} ->
|
||||
{:error, {:cachex_error, e}}
|
||||
end
|
||||
end
|
||||
|
||||
defp set_error_ttl(_url, :body_too_large), do: :ok
|
||||
defp set_error_ttl(_url, {:content_type, _}), do: :ok
|
||||
|
||||
# The TTL is not set for the errors above, since they are unlikely to change
|
||||
# with time
|
||||
|
||||
defp set_error_ttl(url, _reason) do
|
||||
ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
|
||||
@cachex.expire(:rich_media_cache, url, ttl)
|
||||
:ok
|
||||
end
|
||||
|
||||
defp log_error(url, {:invalid_metadata, data}) do
|
||||
Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
|
||||
end
|
||||
|
||||
defp log_error(url, reason) do
|
||||
Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
|
||||
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
||||
def parse(url) do
|
||||
with :ok <- validate_page_url(url),
|
||||
{:ok, data} <- parse_url(url) do
|
||||
data = Map.put(data, "url", url)
|
||||
{:ok, data}
|
||||
end
|
||||
end
|
||||
|
||||
@doc """
|
||||
Set the rich media cache based on the expiration time of image.
|
||||
|
||||
Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL`
|
||||
|
||||
## Example
|
||||
|
||||
defmodule MyModule do
|
||||
@behaviour Pleroma.Web.RichMedia.Parser.TTL
|
||||
def ttl(data, url) do
|
||||
image_url = Map.get(data, :image)
|
||||
# do some parsing in the url and get the ttl of the image
|
||||
# and return ttl is unix time
|
||||
parse_ttl_from_url(image_url)
|
||||
end
|
||||
end
|
||||
|
||||
Define the module in the config
|
||||
|
||||
config :pleroma, :rich_media,
|
||||
ttl_setters: [MyModule]
|
||||
"""
|
||||
@spec set_ttl_based_on_image(map(), String.t()) ::
|
||||
{:ok, Integer.t() | :noop} | {:error, :no_key}
|
||||
def set_ttl_based_on_image(data, url) do
|
||||
case get_ttl_from_image(data, url) do
|
||||
{:ok, ttl} when is_number(ttl) ->
|
||||
ttl = ttl * 1000
|
||||
|
||||
case @cachex.expire_at(:rich_media_cache, url, ttl) do
|
||||
{:ok, true} -> {:ok, ttl}
|
||||
{:ok, false} -> {:error, :no_key}
|
||||
end
|
||||
|
||||
_ ->
|
||||
{:ok, :noop}
|
||||
end
|
||||
end
|
||||
|
||||
defp get_ttl_from_image(data, url) do
|
||||
[:rich_media, :ttl_setters]
|
||||
|> Pleroma.Config.get()
|
||||
|> Enum.reduce({:ok, nil}, fn
|
||||
module, {:ok, _ttl} ->
|
||||
module.ttl(data, url)
|
||||
|
||||
_, error ->
|
||||
error
|
||||
end)
|
||||
end
|
||||
|
||||
def parse_url(url) do
|
||||
defp parse_url(url) do
|
||||
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
|
||||
{:ok, html} <- Floki.parse_document(html) do
|
||||
html
|
||||
|> maybe_parse()
|
||||
|> Map.put("url", url)
|
||||
|> clean_parsed_data()
|
||||
|> check_parsed_data()
|
||||
end
|
||||
|
|
|
@ -3,5 +3,17 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule Pleroma.Web.RichMedia.Parser.TTL do
|
||||
@callback ttl(Map.t(), String.t()) :: Integer.t() | nil
|
||||
@callback ttl(map(), String.t()) :: integer() | nil
|
||||
|
||||
def get_from_image(data, url) do
|
||||
[:rich_media, :ttl_setters]
|
||||
|> Pleroma.Config.get()
|
||||
|> Enum.reduce({:ok, nil}, fn
|
||||
module, {:ok, _ttl} ->
|
||||
module.ttl(data, url)
|
||||
|
||||
_, error ->
|
||||
error
|
||||
end)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -7,7 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
|
|||
|
||||
@impl true
|
||||
def ttl(data, _url) do
|
||||
image = Map.get(data, :image)
|
||||
image = Map.get(data, "image")
|
||||
|
||||
if is_aws_signed_url(image) do
|
||||
image
|
||||
|
|
15
lib/pleroma/workers/rich_media_expiration_worker.ex
Normal file
15
lib/pleroma/workers/rich_media_expiration_worker.ex
Normal file
|
@ -0,0 +1,15 @@
|
|||
# Pleroma: A lightweight social networking server
|
||||
# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
|
||||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule Pleroma.Workers.RichMediaExpirationWorker do
|
||||
alias Pleroma.Web.RichMedia.Card
|
||||
|
||||
use Oban.Worker,
|
||||
queue: :rich_media_expiration
|
||||
|
||||
@impl Oban.Worker
|
||||
def perform(%Job{args: %{"url" => url} = _args}) do
|
||||
Card.delete(url)
|
||||
end
|
||||
end
|
|
@ -0,0 +1,14 @@
|
|||
defmodule Pleroma.Repo.Migrations.CreateRichMediaCard do
|
||||
use Ecto.Migration
|
||||
|
||||
def change do
|
||||
create table(:rich_media_card) do
|
||||
add(:url_hash, :bytea)
|
||||
add(:fields, :map)
|
||||
|
||||
timestamps()
|
||||
end
|
||||
|
||||
create(unique_index(:rich_media_card, [:url_hash]))
|
||||
end
|
||||
end
|
Loading…
Reference in a new issue