Oneric
0e55849a54
This should hopefully fix valid Announces/Likes/etc. being dropped sometimes due to intermittent network errors while resolving the referenced object. Additionally, this now includes the ap_enabled purge, since that field proved buggy and the resulting logs are distracting while screening for issues with the WIP patches.
142 lines
4.8 KiB
Diff
142 lines
4.8 KiB
Diff
From 69e5a336b3fc61d7af43b4e40c495701380e5450 Mon Sep 17 00:00:00 2001
|
|
From: Oneric <oneric@oneric.stub>
|
|
Date: Sat, 23 Nov 2024 19:04:11 +0100
|
|
Subject: [PATCH 09/22] rich_media: don't reattempt parsing on rejected URLs
|
|
|
|
---
|
|
lib/pleroma/web/rich_media/backfill.ex | 4 +++
|
|
lib/pleroma/web/rich_media/parser.ex | 15 ++++++------
|
|
test/pleroma/web/rich_media/parser_test.exs | 27 ++++++++++++++++-----
|
|
3 files changed, 33 insertions(+), 13 deletions(-)
|
|
|
|
diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex
|
|
index 6b2373b01..8c54a0916 100644
|
|
--- a/lib/pleroma/web/rich_media/backfill.ex
|
|
+++ b/lib/pleroma/web/rich_media/backfill.ex
|
|
@@ -57,6 +57,10 @@ def run(%{"url" => url, "url_hash" => url_hash} = args) do
|
|
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
|
|
negative_cache(url_hash, :timer.minutes(30))
|
|
|
|
+ {:error, {:url, reason}} ->
|
|
+ Logger.debug("Rich media error for #{url}: refusing URL #{inspect(reason)}")
|
|
+ negative_cache(url_hash, :timer.minutes(180))
|
|
+
|
|
e ->
|
|
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
|
|
{:error, e}
|
|
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
|
|
index 7f6b5d388..7c5fed2bf 100644
|
|
--- a/lib/pleroma/web/rich_media/parser.ex
|
|
+++ b/lib/pleroma/web/rich_media/parser.ex
|
|
@@ -16,12 +16,13 @@ def parse(nil), do: nil
|
|
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
|
def parse(url) do
|
|
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
|
|
- :ok <- validate_page_url(url),
|
|
+ {_, :ok} <- {:url, validate_page_url(url)},
|
|
{:ok, data} <- parse_url(url) do
|
|
data = Map.put(data, "url", url)
|
|
{:ok, data}
|
|
else
|
|
{:config, _} -> {:error, :rich_media_disabled}
|
|
+ {:url, {:error, reason}} -> {:error, {:url, reason}}
|
|
e -> e
|
|
end
|
|
end
|
|
@@ -62,7 +63,7 @@ defp clean_parsed_data(data) do
|
|
|> Map.new()
|
|
end
|
|
|
|
- @spec validate_page_url(URI.t() | binary()) :: :ok | :error
|
|
+ @spec validate_page_url(URI.t() | binary()) :: :ok | {:error, term()}
|
|
defp validate_page_url(page_url) when is_binary(page_url) do
|
|
validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
|
|
|
|
@@ -74,20 +75,20 @@ defp validate_page_url(page_url) when is_binary(page_url) do
|
|
defp validate_page_url(%URI{host: host, scheme: "https"}) do
|
|
cond do
|
|
Linkify.Parser.ip?(host) ->
|
|
- :error
|
|
+ {:error, :ip}
|
|
|
|
host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
|
|
- :error
|
|
+ {:error, :ignore_hosts}
|
|
|
|
get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
|
|
- :error
|
|
+ {:error, :ignore_tld}
|
|
|
|
true ->
|
|
:ok
|
|
end
|
|
end
|
|
|
|
- defp validate_page_url(_), do: :error
|
|
+ defp validate_page_url(_), do: {:error, "scheme mismatch"}
|
|
|
|
defp parse_uri(true, url) do
|
|
url
|
|
@@ -95,7 +96,7 @@ defp parse_uri(true, url) do
|
|
|> validate_page_url
|
|
end
|
|
|
|
- defp parse_uri(_, _), do: :error
|
|
+ defp parse_uri(_, _), do: {:error, "not an URL"}
|
|
|
|
defp get_tld(host) do
|
|
host
|
|
diff --git a/test/pleroma/web/rich_media/parser_test.exs b/test/pleroma/web/rich_media/parser_test.exs
|
|
index a5f2563a2..bf7864aa7 100644
|
|
--- a/test/pleroma/web/rich_media/parser_test.exs
|
|
+++ b/test/pleroma/web/rich_media/parser_test.exs
|
|
@@ -109,25 +109,40 @@ test "does a HEAD request to check if the body is html" do
|
|
|
|
test "refuses to crawl incomplete URLs" do
|
|
url = "example.com/ogp"
|
|
- assert :error == Parser.parse(url)
|
|
+ assert {:error, {:url, "scheme mismatch"}} == Parser.parse(url)
|
|
+ end
|
|
+
|
|
+ test "refuses to crawl plain HTTP and other scheme URL" do
|
|
+ [
|
|
+ "http://example.com/ogp",
|
|
+ "ftp://example.org/dist/"
|
|
+ ]
|
|
+ |> Enum.each(fn url ->
|
|
+ res = Parser.parse(url)
|
|
+
|
|
+ assert {:error, {:url, "scheme mismatch"}} == res or
|
|
+ {:error, {:url, "not an URL"}} == res
|
|
+ end)
|
|
end
|
|
|
|
test "refuses to crawl malformed URLs" do
|
|
url = "example.com[]/ogp"
|
|
- assert :error == Parser.parse(url)
|
|
+ assert {:error, {:url, "not an URL"}} == Parser.parse(url)
|
|
end
|
|
|
|
test "refuses to crawl URLs of private network from posts" do
|
|
[
|
|
- "http://127.0.0.1:4000/notice/9kCP7VNyPJXFOXDrgO",
|
|
+ "https://127.0.0.1:4000/notice/9kCP7VNyPJXFOXDrgO",
|
|
"https://10.111.10.1/notice/9kCP7V",
|
|
"https://172.16.32.40/notice/9kCP7V",
|
|
- "https://192.168.10.40/notice/9kCP7V",
|
|
- "https://pleroma.local/notice/9kCP7V"
|
|
+ "https://192.168.10.40/notice/9kCP7V"
|
|
]
|
|
|> Enum.each(fn url ->
|
|
- assert :error == Parser.parse(url)
|
|
+ assert {:error, {:url, :ip}} == Parser.parse(url)
|
|
end)
|
|
+
|
|
+ url = "https://pleroma.local/notice/9kCP7V"
|
|
+ assert {:error, {:url, :ignore_tld}} == Parser.parse(url)
|
|
end
|
|
|
|
test "returns error when disabled" do
|
|
--
|
|
2.39.5
|
|
|