From ea1631d7e67e22eb49d608e066ef4a3555bf25f7 Mon Sep 17 00:00:00 2001 From: rinpatch Date: Tue, 11 Feb 2020 00:29:25 +0300 Subject: [PATCH] Make Floki use fast_html --- config/config.exs | 2 ++ lib/pleroma/html.ex | 1 + .../activity_pub/mrf/anti_link_spam_policy.ex | 1 + lib/pleroma/web/metadata/rel_me.ex | 6 +++-- lib/pleroma/web/rel_me.ex | 5 +++-- lib/pleroma/web/rich_media/parser.ex | 6 ++--- .../rich_media/parsers/twitter_card_test.exs | 22 ++++++++++++++----- 7 files changed, 30 insertions(+), 13 deletions(-) diff --git a/config/config.exs b/config/config.exs index 41c1ff637..364aaf776 100644 --- a/config/config.exs +++ b/config/config.exs @@ -612,6 +612,8 @@ config :pleroma, configurable_from_database: false +config :floki, :html_parser, Floki.HTMLParser.FastHtml + # Import environment specific config. This must remain at the bottom # of this file so it overrides the configuration defined above. import_config "#{Mix.env()}.exs" diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex index 11513106e..05946aa96 100644 --- a/lib/pleroma/html.ex +++ b/lib/pleroma/html.ex @@ -108,6 +108,7 @@ def extract_first_external_url(object, content) do Cachex.fetch!(:scrubber_cache, key, fn _key -> result = content + |> Floki.parse_fragment!() |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]") |> Floki.attribute("a", "href") |> Enum.at(0) diff --git a/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex b/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex index 8abe18e29..802d10edc 100644 --- a/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex +++ b/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex @@ -17,6 +17,7 @@ defp old_user?(%User{} = u) do # does the post contain links? defp contains_links?(%{"content" => content} = _object) do content + |> Floki.parse_fragment!() |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl") |> Floki.attribute("a", "href") |> length() > 0 diff --git a/lib/pleroma/web/metadata/rel_me.ex b/lib/pleroma/web/metadata/rel_me.ex index f87fc1973..86dcc1a3b 100644 --- a/lib/pleroma/web/metadata/rel_me.ex +++ b/lib/pleroma/web/metadata/rel_me.ex @@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do @impl Provider def build_tags(%{user: user}) do - (Floki.attribute(user.bio, "link[rel~=me]", "href") ++ - Floki.attribute(user.bio, "a[rel~=me]", "href")) + bio_tree = Floki.parse_fragment!(user.bio) + + (Floki.attribute(bio_tree, "link[rel~=me]", "href") ++ + Floki.attribute(bio_tree, "a[rel~=me]", "href")) |> Enum.map(fn link -> {:link, [rel: "me", href: link], []} end) diff --git a/lib/pleroma/web/rel_me.ex b/lib/pleroma/web/rel_me.ex index 16b1a53d2..540fa65df 100644 --- a/lib/pleroma/web/rel_me.ex +++ b/lib/pleroma/web/rel_me.ex @@ -27,9 +27,10 @@ def parse(_), do: {:error, "No URL provided"} defp parse_url(url) do with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <- Pleroma.HTTP.get(url, [], adapter: @hackney_options), + {:ok, html_tree} <- Floki.parse_document(html), data <- - Floki.attribute(html, "link[rel~=me]", "href") ++ - Floki.attribute(html, "a[rel~=me]", "href") do + Floki.attribute(html_tree, "link[rel~=me]", "href") ++ + Floki.attribute(html_tree, "a[rel~=me]", "href") do {:ok, data} end rescue diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index c06b0a0f2..9702e90f1 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -81,18 +81,18 @@ defp parse_url(url) do {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options) html - |> parse_html + |> parse_html() |> maybe_parse() |> Map.put(:url, url) |> clean_parsed_data() |> check_parsed_data() rescue e -> - {:error, "Parsing error: #{inspect(e)}"} + {:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"} end end - defp parse_html(html), do: Floki.parse(html) + defp parse_html(html), do: Floki.parse_document!(html) defp maybe_parse(html) do Enum.reduce_while(parsers(), %{}, fn parser, acc -> diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs index 751ca614c..f2ebbde7e 100644 --- a/test/web/rich_media/parsers/twitter_card_test.exs +++ b/test/web/rich_media/parsers/twitter_card_test.exs @@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do alias Pleroma.Web.RichMedia.Parsers.TwitterCard test "returns error when html not contains twitter card" do - assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"} + assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == + {:error, "No twitter card metadata found"} end test "parses twitter card with only name attributes" do - html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") + html = + File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") + |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == {:ok, @@ -26,7 +29,9 @@ test "parses twitter card with only name attributes" do end test "parses twitter card with only property attributes" do - html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") + html = + File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") + |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == {:ok, @@ -45,7 +50,9 @@ test "parses twitter card with only property attributes" do end test "parses twitter card with name & property attributes" do - html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") + html = + File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") + |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == {:ok, @@ -73,7 +80,8 @@ test "respect only first title tag on the page" do "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <> "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg" - html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html") + html = + File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == {:ok, @@ -87,7 +95,9 @@ test "respect only first title tag on the page" do end test "takes first founded title in html head if there is html markup error" do - html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") + html = + File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") + |> Floki.parse_document!() assert TwitterCard.parse(html, %{}) == {:ok,