diff --git a/changelog.d/rich_media_refactor.change b/changelog.d/rich_media_refactor.change
new file mode 100644
index 000000000..c0d4e3b0a
--- /dev/null
+++ b/changelog.d/rich_media_refactor.change
@@ -0,0 +1 @@
+Refactored Rich Media to cache the content in the database. Fetching operations that could block status rendering have been eliminated.
diff --git a/config/config.exs b/config/config.exs
index a76d3e208..63544fc23 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -579,7 +579,8 @@
mute_expire: 5,
search_indexing: 10,
nodeinfo_fetcher: 1,
- database_prune: 1
+ database_prune: 1,
+ rich_media_expiration: 2
],
plugins: [
Oban.Plugins.Pruner,
diff --git a/config/test.exs b/config/test.exs
index 75751e115..8c30d5b85 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -141,6 +141,8 @@
config :pleroma, :instances_favicons, enabled: false
config :pleroma, :instances_nodeinfo, enabled: false
+config :pleroma, Pleroma.Web.RichMedia.Backfill, provider: Pleroma.Web.RichMedia.Backfill
+
if File.exists?("./config/test.secret.exs") do
import_config "test.secret.exs"
else
diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex
index bee66169d..4972fb26c 100644
--- a/lib/pleroma/html.ex
+++ b/lib/pleroma/html.ex
@@ -67,22 +67,9 @@ def ensure_scrubbed_html(
end
end
- def extract_first_external_url_from_object(%{data: %{"content" => content}} = object)
+ @spec extract_first_external_url_from_object(Pleroma.Object.t()) :: String.t() | nil
+ def extract_first_external_url_from_object(%{data: %{"content" => content}})
when is_binary(content) do
- unless object.data["fake"] do
- key = "URL|#{object.id}"
-
- @cachex.fetch!(:scrubber_cache, key, fn _key ->
- {:commit, {:ok, extract_first_external_url(content)}}
- end)
- else
- {:ok, extract_first_external_url(content)}
- end
- end
-
- def extract_first_external_url_from_object(_), do: {:error, :no_content}
-
- def extract_first_external_url(content) do
content
|> Floki.parse_fragment!()
|> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
@@ -90,4 +77,6 @@ def extract_first_external_url(content) do
|> Floki.attribute("href")
|> Enum.at(0)
end
+
+ def extract_first_external_url_from_object(_), do: nil
end
diff --git a/lib/pleroma/web/activity_pub/activity_pub.ex b/lib/pleroma/web/activity_pub/activity_pub.ex
index c1b7bc71a..c87072300 100644
--- a/lib/pleroma/web/activity_pub/activity_pub.ex
+++ b/lib/pleroma/web/activity_pub/activity_pub.ex
@@ -155,9 +155,7 @@ def insert(map, local \\ true, fake \\ false, bypass_actor_check \\ false) when
# Splice in the child object if we have one.
activity = Maps.put_if_present(activity, :object, object)
- ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn ->
- Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
- end)
+ Pleroma.Web.RichMedia.Card.get_by_activity(activity)
# Add local posts to search index
if local, do: Pleroma.Search.add_to_index(activity)
@@ -185,7 +183,7 @@ def insert(map, local \\ true, fake \\ false, bypass_actor_check \\ false) when
id: "pleroma:fakeid"
}
- Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
+ Pleroma.Web.RichMedia.Card.get_by_activity(activity)
{:ok, activity}
{:remote_limit_pass, _} ->
diff --git a/lib/pleroma/web/activity_pub/side_effects.ex b/lib/pleroma/web/activity_pub/side_effects.ex
index 963c6d8c6..0e85e47be 100644
--- a/lib/pleroma/web/activity_pub/side_effects.ex
+++ b/lib/pleroma/web/activity_pub/side_effects.ex
@@ -225,9 +225,7 @@ def handle(%{data: %{"type" => "Create"}} = activity, meta) do
end
end
- ConcurrentLimiter.limit(Pleroma.Web.RichMedia.Helpers, fn ->
- Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end)
- end)
+ Pleroma.Web.RichMedia.Card.get_by_activity(activity)
Pleroma.Search.add_to_index(Map.put(activity, :object, object))
diff --git a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex
index acb5f15a0..c4e555c5e 100644
--- a/lib/pleroma/web/mastodon_api/controllers/status_controller.ex
+++ b/lib/pleroma/web/mastodon_api/controllers/status_controller.ex
@@ -25,6 +25,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusController do
alias Pleroma.Web.OAuth.Token
alias Pleroma.Web.Plugs.OAuthScopesPlug
alias Pleroma.Web.Plugs.RateLimiter
+ alias Pleroma.Web.RichMedia.Card
plug(Pleroma.Web.ApiSpec.CastAndValidate)
@@ -383,6 +384,21 @@ def unmute_conversation(%{assigns: %{user: user}} = conn, %{id: id}) do
end
end
+ @doc "GET /api/v1/statuses/:id/card"
+ @deprecated "https://github.com/tootsuite/mastodon/pull/11213"
+ def card(
+ %{assigns: %{user: user}, private: %{open_api_spex: %{params: %{id: status_id}}}} = conn,
+ _
+ ) do
+ with %Activity{} = activity <- Activity.get_by_id(status_id),
+ true <- Visibility.visible_for_user?(activity, user),
+ %Card{} = card_data <- Card.get_by_activity(activity) do
+ render(conn, "card.json", card_data)
+ else
+ _ -> render_error(conn, :not_found, "Record not found")
+ end
+ end
+
@doc "GET /api/v1/statuses/:id/favourited_by"
def favourited_by(%{assigns: %{user: user}} = conn, %{id: id}) do
with true <- Pleroma.Config.get([:instance, :show_reactions]),
diff --git a/lib/pleroma/web/mastodon_api/views/status_view.ex b/lib/pleroma/web/mastodon_api/views/status_view.ex
index ac0955534..cc70d4731 100644
--- a/lib/pleroma/web/mastodon_api/views/status_view.ex
+++ b/lib/pleroma/web/mastodon_api/views/status_view.ex
@@ -22,6 +22,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
alias Pleroma.Web.MediaProxy
alias Pleroma.Web.PleromaAPI.EmojiReactionController
require Logger
+ alias Pleroma.Web.RichMedia.Card
import Pleroma.Web.ActivityPub.Visibility, only: [get_visibility: 1, visible_for_user?: 2]
@@ -30,9 +31,7 @@ defmodule Pleroma.Web.MastodonAPI.StatusView do
# pagination is restricted to 40 activities at a time
defp fetch_rich_media_for_activities(activities) do
Enum.each(activities, fn activity ->
- spawn(fn ->
- Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
- end)
+ spawn(fn -> Card.get_by_activity(activity) end)
end)
end
@@ -93,9 +92,7 @@ def render("index.json", opts) do
# To do: check AdminAPIControllerTest on the reasons behind nil activities in the list
activities = Enum.filter(opts.activities, & &1)
- # Start fetching rich media before doing anything else, so that later calls to get the cards
- # only block for timeout in the worst case, as opposed to
- # length(activities_with_links) * timeout
+ # Start prefetching rich media before doing anything else
fetch_rich_media_for_activities(activities)
replied_to_activities = get_replied_to_activities(activities)
@@ -301,12 +298,82 @@ def render("show.json", %{activity: %{id: id, data: %{"object" => _object}} = ac
object
|> render_content()
- content_html =
- content
- |> Activity.HTML.get_cached_scrubbed_html_for_activity(
- User.html_filter_policy(opts[:for]),
- activity,
- "mastoapi:content:#{chrono_order}"
+ quote_post =
+ if visible_for_user?(quote_activity, opts[:for]) and opts[:show_quote] != false do
+ quote_rendering_opts = Map.merge(opts, %{activity: quote_activity, show_quote: false})
+ render("show.json", quote_rendering_opts)
+ else
+ nil
+ end
+
+ content =
+ object
+ |> render_content()
+
+ content_html =
+ content
+ |> Activity.HTML.get_cached_scrubbed_html_for_activity(
+ User.html_filter_policy(opts[:for]),
+ activity,
+ "mastoapi:content:#{chrono_order}"
+ )
+
+ content_plaintext =
+ content
+ |> Activity.HTML.get_cached_stripped_html_for_activity(
+ activity,
+ "mastoapi:content:#{chrono_order}"
+ )
+
+ summary = object.data["summary"] || ""
+
+ card =
+ case Card.get_by_activity(activity) do
+ %Card{} = result -> render("card.json", result)
+ _ -> nil
+ end
+
+ url =
+ if user.local do
+ Pleroma.Web.Router.Helpers.o_status_url(Pleroma.Web.Endpoint, :notice, activity)
+ else
+ object.data["url"] || object.data["external_url"] || object.data["id"]
+ end
+
+ direct_conversation_id =
+ with {_, nil} <- {:direct_conversation_id, opts[:direct_conversation_id]},
+ {_, true} <- {:include_id, opts[:with_direct_conversation_id]},
+ {_, %User{} = for_user} <- {:for_user, opts[:for]} do
+ Activity.direct_conversation_id(activity, for_user)
+ else
+ {:direct_conversation_id, participation_id} when is_integer(participation_id) ->
+ participation_id
+
+ _e ->
+ nil
+ end
+
+ emoji_reactions =
+ object
+ |> Object.get_emoji_reactions()
+ |> EmojiReactionController.filter_allowed_users(
+ opts[:for],
+ Map.get(opts, :with_muted, false)
+ )
+ |> Stream.map(fn {emoji, users, url} ->
+ build_emoji_map(emoji, users, url, opts[:for])
+ end)
+ |> Enum.to_list()
+
+ # Status muted state (would do 1 request per status unless user mutes are preloaded)
+ muted =
+ thread_muted? ||
+ UserRelationship.exists?(
+ get_in(opts, [:relationships, :user_relationships]),
+ :mute,
+ opts[:for],
+ user,
+ fn for_user, user -> User.mutes?(for_user, user) end
)
content_plaintext =
@@ -528,15 +595,8 @@ def render("source.json", %{activity: %{data: %{"object" => _object}} = activity
}
end
- def render("card.json", %{rich_media: rich_media, page_url: page_url}) do
- page_url_data = URI.parse(page_url)
-
- page_url_data =
- if is_binary(rich_media["url"]) do
- URI.merge(page_url_data, URI.parse(rich_media["url"]))
- else
- page_url_data
- end
+ def render("card.json", %Card{fields: rich_media}) do
+ page_url_data = URI.parse(rich_media["url"])
page_url = page_url_data |> to_string
diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex
new file mode 100644
index 000000000..112028901
--- /dev/null
+++ b/lib/pleroma/web/rich_media/backfill.ex
@@ -0,0 +1,101 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Backfill.Task do
+ alias Pleroma.Web.RichMedia.Backfill
+
+ def run(args) do
+ Task.Supervisor.start_child(Pleroma.TaskSupervisor, Backfill, :run, [args],
+ name: {:global, {:rich_media, args.url_hash}}
+ )
+ end
+end
+
+defmodule Pleroma.Web.RichMedia.Backfill do
+ alias Pleroma.Web.RichMedia.Card
+ alias Pleroma.Web.RichMedia.Parser
+ alias Pleroma.Web.RichMedia.Parser.TTL
+ alias Pleroma.Workers.RichMediaExpirationWorker
+
+ require Logger
+
+ @backfiller Pleroma.Config.get([__MODULE__, :provider], Pleroma.Web.RichMedia.Backfill.Task)
+ @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+ @max_attempts 3
+ @retry 5_000
+
+ def start(%{url: url} = args) when is_binary(url) do
+ url_hash = Card.url_to_hash(url)
+
+ args =
+ args
+ |> Map.put(:attempt, 1)
+ |> Map.put(:url_hash, url_hash)
+
+ @backfiller.run(args)
+ end
+
+ def run(%{url: url, url_hash: url_hash, attempt: attempt} = args)
+ when attempt <= @max_attempts do
+ case Parser.parse(url) do
+ {:ok, fields} ->
+ {:ok, card} = Card.create(url, fields)
+
+ maybe_schedule_expiration(url, fields)
+
+ if Map.has_key?(args, :activity_id) do
+ stream_update(args)
+ end
+
+ warm_cache(url_hash, card)
+
+ {:error, {:invalid_metadata, fields}} ->
+ Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
+ negative_cache(url_hash)
+
+ {:error, :body_too_large} ->
+ Logger.error("Rich media error for #{url}: :body_too_large")
+ negative_cache(url_hash)
+
+ {:error, {:content_type, type}} ->
+ Logger.debug("Rich media error for #{url}: :content_type is #{type}")
+ negative_cache(url_hash)
+
+ e ->
+ Logger.debug("Rich media error for #{url}: #{inspect(e)}")
+
+ :timer.sleep(@retry * attempt)
+
+ run(%{args | attempt: attempt + 1})
+ end
+ end
+
+ def run(%{url: url, url_hash: url_hash}) do
+ Logger.debug("Rich media failure for #{url}")
+
+ negative_cache(url_hash, :timer.minutes(15))
+ end
+
+ defp maybe_schedule_expiration(url, fields) do
+ case TTL.get_from_image(fields, url) do
+ ttl when is_number(ttl) ->
+ timestamp = DateTime.from_unix!(ttl)
+
+ RichMediaExpirationWorker.new(%{"url" => url}, scheduled_at: timestamp)
+ |> Oban.insert()
+
+ _ ->
+ :ok
+ end
+ end
+
+ defp stream_update(%{activity_id: activity_id}) do
+ Pleroma.Activity.get_by_id(activity_id)
+ |> Pleroma.Activity.normalize()
+ |> Pleroma.Web.ActivityPub.ActivityPub.stream_out()
+ end
+
+ defp warm_cache(key, val), do: @cachex.put(:rich_media_cache, key, val)
+ defp negative_cache(key, ttl \\ nil), do: @cachex.put(:rich_media_cache, key, nil, ttl: ttl)
+end
diff --git a/lib/pleroma/web/rich_media/card.ex b/lib/pleroma/web/rich_media/card.ex
new file mode 100644
index 000000000..2d36f2b62
--- /dev/null
+++ b/lib/pleroma/web/rich_media/card.ex
@@ -0,0 +1,157 @@
+defmodule Pleroma.Web.RichMedia.Card do
+ use Ecto.Schema
+ import Ecto.Changeset
+ import Ecto.Query
+
+ alias Pleroma.Activity
+ alias Pleroma.HTML
+ alias Pleroma.Object
+ alias Pleroma.Repo
+ alias Pleroma.Web.RichMedia.Backfill
+ alias Pleroma.Web.RichMedia.Parser
+
+ @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+ @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
+
+ @type t :: %__MODULE__{}
+
+ schema "rich_media_card" do
+ field(:url_hash, :binary)
+ field(:fields, :map)
+
+ timestamps()
+ end
+
+ @doc false
+ def changeset(card, attrs) do
+ card
+ |> cast(attrs, [:url_hash, :fields])
+ |> validate_required([:url_hash, :fields])
+ |> unique_constraint(:url_hash)
+ end
+
+ @spec create(String.t(), map()) :: {:ok, t()}
+ def create(url, fields) do
+ url_hash = url_to_hash(url)
+
+ fields = Map.put_new(fields, "url", url)
+
+ %__MODULE__{}
+ |> changeset(%{url_hash: url_hash, fields: fields})
+ |> Repo.insert(on_conflict: {:replace, [:fields]}, conflict_target: :url_hash)
+ end
+
+ @spec delete(String.t()) :: {:ok, Ecto.Schema.t()} | {:error, Ecto.Changeset.t()} | :ok
+ def delete(url) do
+ url_hash = url_to_hash(url)
+ @cachex.del(:rich_media_cache, url_hash)
+
+ case get_by_url(url) do
+ %__MODULE{} = card -> Repo.delete(card)
+ nil -> :ok
+ end
+ end
+
+ @spec get_by_url(String.t() | nil) :: t() | nil | :error
+ def get_by_url(url) when is_binary(url) do
+ if @config_impl.get([:rich_media, :enabled]) do
+ url_hash = url_to_hash(url)
+
+ @cachex.fetch!(:rich_media_cache, url_hash, fn _ ->
+ result =
+ __MODULE__
+ |> where(url_hash: ^url_hash)
+ |> Repo.one()
+
+ case result do
+ %__MODULE__{} = card -> {:commit, card}
+ _ -> {:ignore, nil}
+ end
+ end)
+ else
+ :error
+ end
+ end
+
+ def get_by_url(nil), do: nil
+
+ @spec get_or_backfill_by_url(String.t(), map()) :: t() | nil
+ def get_or_backfill_by_url(url, backfill_opts \\ %{}) do
+ case get_by_url(url) do
+ %__MODULE__{} = card ->
+ card
+
+ nil ->
+ backfill_opts = Map.put(backfill_opts, :url, url)
+
+ Backfill.start(backfill_opts)
+
+ nil
+
+ :error ->
+ nil
+ end
+ end
+
+ @spec get_by_object(Object.t()) :: t() | nil | :error
+ def get_by_object(object) do
+ case HTML.extract_first_external_url_from_object(object) do
+ nil -> nil
+ url -> get_or_backfill_by_url(url)
+ end
+ end
+
+ @spec get_by_activity(Activity.t()) :: t() | nil | :error
+ # Fake/Draft activity
+ def get_by_activity(%Activity{id: "pleroma:fakeid"} = activity) do
+ with %Object{} = object <- Object.normalize(activity, fetch: false),
+ url when not is_nil(url) <- HTML.extract_first_external_url_from_object(object) do
+ case get_by_url(url) do
+ # Cache hit
+ %__MODULE__{} = card ->
+ card
+
+ # Cache miss, but fetch for rendering the Draft
+ _ ->
+ with {:ok, fields} <- Parser.parse(url),
+ {:ok, card} <- create(url, fields) do
+ card
+ else
+ _ -> nil
+ end
+ end
+ else
+ _ ->
+ nil
+ end
+ end
+
+ def get_by_activity(activity) do
+ with %Object{} = object <- Object.normalize(activity, fetch: false),
+ {_, nil} <- {:cached, get_cached_url(object, activity.id)} do
+ nil
+ else
+ {:cached, url} ->
+ get_or_backfill_by_url(url, %{activity_id: activity.id})
+
+ _ ->
+ :error
+ end
+ end
+
+ @spec url_to_hash(String.t()) :: String.t()
+ def url_to_hash(url) do
+ :crypto.hash(:sha256, url) |> Base.encode16(case: :lower)
+ end
+
+ defp get_cached_url(object, activity_id) do
+ key = "URL|#{activity_id}"
+
+ @cachex.fetch!(:scrubber_cache, key, fn _ ->
+ url = HTML.extract_first_external_url_from_object(object)
+ Activity.HTML.add_cache_key_for(activity_id, key)
+
+ {:commit, url}
+ end)
+ end
+end
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
index 061c1a795..68aae5e3e 100644
--- a/lib/pleroma/web/rich_media/helpers.ex
+++ b/lib/pleroma/web/rich_media/helpers.ex
@@ -3,85 +3,13 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Helpers do
- alias Pleroma.Activity
alias Pleroma.Config
- alias Pleroma.HTML
- alias Pleroma.Object
- alias Pleroma.Web.RichMedia.Parser
-
- @options [
- max_body: 2_000_000,
- receive_timeout: 2_000
- ]
-
- @spec validate_page_url(URI.t() | binary()) :: :ok | :error
- defp validate_page_url(page_url) when is_binary(page_url) do
- validate_tld = Config.get([Pleroma.Formatter, :validate_tld])
-
- page_url
- |> Linkify.Parser.url?(validate_tld: validate_tld)
- |> parse_uri(page_url)
- end
-
- defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
- when is_binary(authority) do
- cond do
- host in Config.get([:rich_media, :ignore_hosts], []) ->
- :error
-
- get_tld(host) in Config.get([:rich_media, :ignore_tld], []) ->
- :error
-
- true ->
- :ok
- end
- end
-
- defp validate_page_url(_), do: :error
-
- defp parse_uri(true, url) do
- url
- |> URI.parse()
- |> validate_page_url
- end
-
- defp parse_uri(_, _), do: :error
-
- defp get_tld(host) do
- host
- |> String.split(".")
- |> Enum.reverse()
- |> hd
- end
-
- def fetch_data_for_object(object) do
- with true <- Config.get([:rich_media, :enabled]),
- {:ok, page_url} <-
- HTML.extract_first_external_url_from_object(object),
- :ok <- validate_page_url(page_url),
- {:ok, rich_media} <- Parser.parse(page_url) do
- %{page_url: page_url, rich_media: rich_media}
- else
- _ -> %{}
- end
- end
-
- def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
- with true <- Config.get([:rich_media, :enabled]),
- %Object{} = object <- Object.normalize(activity, fetch: false) do
- fetch_data_for_object(object)
- else
- _ -> %{}
- end
- end
-
- def fetch_data_for_activity(_), do: %{}
def rich_media_get(url) do
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
head_check =
- case Pleroma.HTTP.head(url, headers, @options) do
+ case Pleroma.HTTP.head(url, headers, http_options()) do
# If the HEAD request didn't reach the server for whatever reason,
# we assume the GET that comes right after won't either
{:error, _} = e ->
@@ -96,7 +24,7 @@ def rich_media_get(url) do
:ok
end
- with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options)
+ with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
end
defp check_content_type(headers) do
@@ -112,12 +40,13 @@ defp check_content_type(headers) do
end
end
- @max_body @options[:max_body]
defp check_content_length(headers) do
+ max_body = Keyword.get(http_options(), :max_body)
+
case List.keyfind(headers, "content-length", 0) do
{_, maybe_content_length} ->
case Integer.parse(maybe_content_length) do
- {content_length, ""} when content_length <= @max_body -> :ok
+ {content_length, ""} when content_length <= max_body -> :ok
{_, ""} -> {:error, :body_too_large}
_ -> :ok
end
@@ -126,4 +55,11 @@ defp check_content_length(headers) do
:ok
end
end
+
+ defp http_options() do
+ [
+ pool: :media,
+ max_body: Config.get([:rich_media, :max_body], 2_000_000)
+ ]
+ end
end
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
index 3ba0086f0..0fb09f5a8 100644
--- a/lib/pleroma/web/rich_media/parser.ex
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -5,137 +5,28 @@
defmodule Pleroma.Web.RichMedia.Parser do
require Logger
- @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+ @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
defp parsers do
Pleroma.Config.get([:rich_media, :parsers])
end
- def parse(nil), do: {:error, "No URL provided"}
+ def parse(nil), do: nil
- if Pleroma.Config.get(:env) == :test do
- @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
- def parse(url), do: parse_with_timeout(url)
- else
- @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
- def parse(url) do
- with {:ok, data} <- get_cached_or_parse(url),
- {:ok, _} <- set_ttl_based_on_image(data, url) do
- {:ok, data}
- end
- end
-
- defp get_cached_or_parse(url) do
- case @cachex.fetch(:rich_media_cache, url, fn ->
- case parse_with_timeout(url) do
- {:ok, _} = res ->
- {:commit, res}
-
- {:error, reason} = e ->
- # Unfortunately we have to log errors here, instead of doing that
- # along with ttl setting at the bottom. Otherwise we can get log spam
- # if more than one process was waiting for the rich media card
- # while it was generated. Ideally we would set ttl here as well,
- # so we don't override it number_of_waiters_on_generation
- # times, but one, obviously, can't set ttl for not-yet-created entry
- # and Cachex doesn't support returning ttl from the fetch callback.
- log_error(url, reason)
- {:commit, e}
- end
- end) do
- {action, res} when action in [:commit, :ok] ->
- case res do
- {:ok, _data} = res ->
- res
-
- {:error, reason} = e ->
- if action == :commit, do: set_error_ttl(url, reason)
- e
- end
-
- {:error, e} ->
- {:error, {:cachex_error, e}}
- end
- end
-
- defp set_error_ttl(_url, :body_too_large), do: :ok
- defp set_error_ttl(_url, {:content_type, _}), do: :ok
-
- # The TTL is not set for the errors above, since they are unlikely to change
- # with time
-
- defp set_error_ttl(url, _reason) do
- ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
- @cachex.expire(:rich_media_cache, url, ttl)
- :ok
- end
-
- defp log_error(url, {:invalid_metadata, data}) do
- Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
- end
-
- defp log_error(url, reason) do
- Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
+ @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
+ def parse(url) do
+ with :ok <- validate_page_url(url),
+ {:ok, data} <- parse_url(url) do
+ data = Map.put(data, "url", url)
+ {:ok, data}
end
end
- @doc """
- Set the rich media cache based on the expiration time of image.
-
- Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL`
-
- ## Example
-
- defmodule MyModule do
- @behaviour Pleroma.Web.RichMedia.Parser.TTL
- def ttl(data, url) do
- image_url = Map.get(data, :image)
- # do some parsing in the url and get the ttl of the image
- # and return ttl is unix time
- parse_ttl_from_url(image_url)
- end
- end
-
- Define the module in the config
-
- config :pleroma, :rich_media,
- ttl_setters: [MyModule]
- """
- @spec set_ttl_based_on_image(map(), String.t()) ::
- {:ok, Integer.t() | :noop} | {:error, :no_key}
- def set_ttl_based_on_image(data, url) do
- case get_ttl_from_image(data, url) do
- {:ok, ttl} when is_number(ttl) ->
- ttl = ttl * 1000
-
- case @cachex.expire_at(:rich_media_cache, url, ttl) do
- {:ok, true} -> {:ok, ttl}
- {:ok, false} -> {:error, :no_key}
- end
-
- _ ->
- {:ok, :noop}
- end
- end
-
- defp get_ttl_from_image(data, url) do
- [:rich_media, :ttl_setters]
- |> Pleroma.Config.get()
- |> Enum.reduce({:ok, nil}, fn
- module, {:ok, _ttl} ->
- module.ttl(data, url)
-
- _, error ->
- error
- end)
- end
-
- def parse_url(url) do
+ defp parse_url(url) do
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
{:ok, html} <- Floki.parse_document(html) do
html
|> maybe_parse()
- |> Map.put("url", url)
|> clean_parsed_data()
|> check_parsed_data()
end
diff --git a/lib/pleroma/web/rich_media/parser/ttl.ex b/lib/pleroma/web/rich_media/parser/ttl.ex
index 0b7f14fb2..de319e493 100644
--- a/lib/pleroma/web/rich_media/parser/ttl.ex
+++ b/lib/pleroma/web/rich_media/parser/ttl.ex
@@ -3,5 +3,17 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser.TTL do
- @callback ttl(Map.t(), String.t()) :: Integer.t() | nil
+ @callback ttl(map(), String.t()) :: integer() | nil
+
+ def get_from_image(data, url) do
+ [:rich_media, :ttl_setters]
+ |> Pleroma.Config.get()
+ |> Enum.reduce({:ok, nil}, fn
+ module, {:ok, _ttl} ->
+ module.ttl(data, url)
+
+ _, error ->
+ error
+ end)
+ end
end
diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex
index c7eb267f3..9fbeaadf2 100644
--- a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex
+++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex
@@ -7,7 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
@impl true
def ttl(data, _url) do
- image = Map.get(data, :image)
+ image = Map.get(data, "image")
if is_aws_signed_url(image) do
image
diff --git a/lib/pleroma/workers/rich_media_expiration_worker.ex b/lib/pleroma/workers/rich_media_expiration_worker.ex
new file mode 100644
index 000000000..d7ae497a7
--- /dev/null
+++ b/lib/pleroma/workers/rich_media_expiration_worker.ex
@@ -0,0 +1,15 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Workers.RichMediaExpirationWorker do
+ alias Pleroma.Web.RichMedia.Card
+
+ use Oban.Worker,
+ queue: :rich_media_expiration
+
+ @impl Oban.Worker
+ def perform(%Job{args: %{"url" => url} = _args}) do
+ Card.delete(url)
+ end
+end
diff --git a/priv/repo/migrations/20240207035927_create_rich_media_card.exs b/priv/repo/migrations/20240207035927_create_rich_media_card.exs
new file mode 100644
index 000000000..b5e48bccb
--- /dev/null
+++ b/priv/repo/migrations/20240207035927_create_rich_media_card.exs
@@ -0,0 +1,14 @@
+defmodule Pleroma.Repo.Migrations.CreateRichMediaCard do
+ use Ecto.Migration
+
+ def change do
+ create table(:rich_media_card) do
+ add(:url_hash, :bytea)
+ add(:fields, :map)
+
+ timestamps()
+ end
+
+ create(unique_index(:rich_media_card, [:url_hash]))
+ end
+end