Merge branch 'feat/floki-fasthtml' into 'develop'

Make Floki use fast_html

See merge request pleroma/pleroma!2194
This commit is contained in:
lain 2020-02-11 13:22:35 +00:00
commit 3fee859b60
9 changed files with 35 additions and 18 deletions

View file

@ -612,6 +612,8 @@
config :pleroma, configurable_from_database: false config :pleroma, configurable_from_database: false
config :floki, :html_parser, Floki.HTMLParser.FastHtml
# Import environment specific config. This must remain at the bottom # Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above. # of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs" import_config "#{Mix.env()}.exs"

View file

@ -108,6 +108,7 @@ def extract_first_external_url(object, content) do
Cachex.fetch!(:scrubber_cache, key, fn _key -> Cachex.fetch!(:scrubber_cache, key, fn _key ->
result = result =
content content
|> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]") |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|> Floki.attribute("a", "href") |> Floki.attribute("a", "href")
|> Enum.at(0) |> Enum.at(0)

View file

@ -17,6 +17,7 @@ defp old_user?(%User{} = u) do
# does the post contain links? # does the post contain links?
defp contains_links?(%{"content" => content} = _object) do defp contains_links?(%{"content" => content} = _object) do
content content
|> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl") |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|> Floki.attribute("a", "href") |> Floki.attribute("a", "href")
|> length() > 0 |> length() > 0

View file

@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do
@impl Provider @impl Provider
def build_tags(%{user: user}) do def build_tags(%{user: user}) do
(Floki.attribute(user.bio, "link[rel~=me]", "href") ++ bio_tree = Floki.parse_fragment!(user.bio)
Floki.attribute(user.bio, "a[rel~=me]", "href"))
(Floki.attribute(bio_tree, "link[rel~=me]", "href") ++
Floki.attribute(bio_tree, "a[rel~=me]", "href"))
|> Enum.map(fn link -> |> Enum.map(fn link ->
{:link, [rel: "me", href: link], []} {:link, [rel: "me", href: link], []}
end) end)

View file

@ -27,9 +27,10 @@ def parse(_), do: {:error, "No URL provided"}
defp parse_url(url) do defp parse_url(url) do
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <- with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
Pleroma.HTTP.get(url, [], adapter: @hackney_options), Pleroma.HTTP.get(url, [], adapter: @hackney_options),
{:ok, html_tree} <- Floki.parse_document(html),
data <- data <-
Floki.attribute(html, "link[rel~=me]", "href") ++ Floki.attribute(html_tree, "link[rel~=me]", "href") ++
Floki.attribute(html, "a[rel~=me]", "href") do Floki.attribute(html_tree, "a[rel~=me]", "href") do
{:ok, data} {:ok, data}
end end
rescue rescue

View file

@ -81,18 +81,18 @@ defp parse_url(url) do
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options) {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
html html
|> parse_html |> parse_html()
|> maybe_parse() |> maybe_parse()
|> Map.put(:url, url) |> Map.put(:url, url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
rescue rescue
e -> e ->
{:error, "Parsing error: #{inspect(e)}"} {:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
end end
end end
defp parse_html(html), do: Floki.parse(html) defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->

View file

@ -139,7 +139,7 @@ defp deps do
{:phoenix_swoosh, "~> 0.2"}, {:phoenix_swoosh, "~> 0.2"},
{:gen_smtp, "~> 0.13"}, {:gen_smtp, "~> 0.13"},
{:websocket_client, git: "https://github.com/jeremyong/websocket_client.git", only: :test}, {:websocket_client, git: "https://github.com/jeremyong/websocket_client.git", only: :test},
{:floki, "~> 0.23.0"}, {:floki, "~> 0.25"},
{:ex_syslogger, github: "slashmili/ex_syslogger", tag: "1.4.0"}, {:ex_syslogger, github: "slashmili/ex_syslogger", tag: "1.4.0"},
{:timex, "~> 3.5"}, {:timex, "~> 3.5"},
{:ueberauth, "~> 0.4"}, {:ueberauth, "~> 0.4"},

View file

@ -37,16 +37,16 @@
"ex_machina": {:hex, :ex_machina, "2.3.0", "92a5ad0a8b10ea6314b876a99c8c9e3f25f4dde71a2a835845b136b9adaf199a", [:mix], [{:ecto, "~> 2.2 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_sql, "~> 3.0", [hex: :ecto_sql, repo: "hexpm", optional: true]}], "hexpm"}, "ex_machina": {:hex, :ex_machina, "2.3.0", "92a5ad0a8b10ea6314b876a99c8c9e3f25f4dde71a2a835845b136b9adaf199a", [:mix], [{:ecto, "~> 2.2 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_sql, "~> 3.0", [hex: :ecto_sql, repo: "hexpm", optional: true]}], "hexpm"},
"ex_syslogger": {:git, "https://github.com/slashmili/ex_syslogger.git", "f3963399047af17e038897c69e20d552e6899e1d", [tag: "1.4.0"]}, "ex_syslogger": {:git, "https://github.com/slashmili/ex_syslogger.git", "f3963399047af17e038897c69e20d552e6899e1d", [tag: "1.4.0"]},
"excoveralls": {:hex, :excoveralls, "0.12.1", "a553c59f6850d0aff3770e4729515762ba7c8e41eedde03208182a8dc9d0ce07", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"}, "excoveralls": {:hex, :excoveralls, "0.12.1", "a553c59f6850d0aff3770e4729515762ba7c8e41eedde03208182a8dc9d0ce07", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"},
"fast_html": {:hex, :fast_html, "0.99.4", "d80812664f0429607e1d880fba0ef04da87a2e4fa596701bcaae17953535695c", [:make, :mix], [], "hexpm"}, "fast_html": {:hex, :fast_html, "1.0.2", "b2a32022741699421e90762ce904cacb4faf12c10129acc3674262dd7fa5d2b6", [:make, :mix], [], "hexpm"},
"fast_sanitize": {:hex, :fast_sanitize, "0.1.4", "6c2e7203ca2f8275527a3021ba6e9d5d4ee213a47dc214a97c128737c9e56df1", [:mix], [{:fast_html, "~> 0.99", [hex: :fast_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.8", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"}, "fast_sanitize": {:hex, :fast_sanitize, "0.1.7", "2a7cd8734c88a2de6de55022104f8a3b87f1fdbe8bbf131d9049764b53d50d0d", [:mix], [{:fast_html, "~> 1.0", [hex: :fast_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.8", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"},
"flake_id": {:hex, :flake_id, "0.1.0", "7716b086d2e405d09b647121a166498a0d93d1a623bead243e1f74216079ccb3", [:mix], [{:base62, "~> 1.2", [hex: :base62, repo: "hexpm", optional: false]}, {:ecto, ">= 2.0.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"}, "flake_id": {:hex, :flake_id, "0.1.0", "7716b086d2e405d09b647121a166498a0d93d1a623bead243e1f74216079ccb3", [:mix], [{:base62, "~> 1.2", [hex: :base62, repo: "hexpm", optional: false]}, {:ecto, ">= 2.0.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
"floki": {:hex, :floki, "0.23.1", "e100306ce7d8841d70a559748e5091542e2cfc67ffb3ade92b89a8435034dab1", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm"}, "floki": {:hex, :floki, "0.25.0", "b1c9ddf5f32a3a90b43b76f3386ca054325dc2478af020e87b5111c19f2284ac", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm"},
"gen_smtp": {:hex, :gen_smtp, "0.15.0", "9f51960c17769b26833b50df0b96123605a8024738b62db747fece14eb2fbfcc", [:rebar3], [], "hexpm"}, "gen_smtp": {:hex, :gen_smtp, "0.15.0", "9f51960c17769b26833b50df0b96123605a8024738b62db747fece14eb2fbfcc", [:rebar3], [], "hexpm"},
"gen_stage": {:hex, :gen_stage, "0.14.3", "d0c66f1c87faa301c1a85a809a3ee9097a4264b2edf7644bf5c123237ef732bf", [:mix], [], "hexpm"}, "gen_stage": {:hex, :gen_stage, "0.14.3", "d0c66f1c87faa301c1a85a809a3ee9097a4264b2edf7644bf5c123237ef732bf", [:mix], [], "hexpm"},
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"}, "gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.17.1", "8baab33482df4907b3eae22f719da492cee3981a26e649b9c2be1c0192616962", [:mix], [], "hexpm"}, "gettext": {:hex, :gettext, "0.17.1", "8baab33482df4907b3eae22f719da492cee3981a26e649b9c2be1c0192616962", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"}, "hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"html_entities": {:hex, :html_entities, "0.5.0", "40f5c5b9cbe23073b48a4e69c67b6c11974f623a76165e2b92d098c0e88ccb1d", [:mix], [], "hexpm"}, "html_entities": {:hex, :html_entities, "0.5.1", "1c9715058b42c35a2ab65edc5b36d0ea66dd083767bef6e3edb57870ef556549", [:mix], [], "hexpm"},
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.3.0", "f005ad692b717691203f940c686208aa3d8ffd9dd4bb3699240096a51fa9564e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"}, "html_sanitize_ex": {:hex, :html_sanitize_ex, "1.3.0", "f005ad692b717691203f940c686208aa3d8ffd9dd4bb3699240096a51fa9564e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"},
"http_signatures": {:git, "https://git.pleroma.social/pleroma/http_signatures.git", "293d77bb6f4a67ac8bde1428735c3b42f22cbb30", [ref: "293d77bb6f4a67ac8bde1428735c3b42f22cbb30"]}, "http_signatures": {:git, "https://git.pleroma.social/pleroma/http_signatures.git", "293d77bb6f4a67ac8bde1428735c3b42f22cbb30", [ref: "293d77bb6f4a67ac8bde1428735c3b42f22cbb30"]},
"httpoison": {:hex, :httpoison, "1.6.1", "2ce5bf6e535cd0ab02e905ba8c276580bab80052c5c549f53ddea52d72e81f33", [:mix], [{:hackney, "~> 1.15 and >= 1.15.2", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, "httpoison": {:hex, :httpoison, "1.6.1", "2ce5bf6e535cd0ab02e905ba8c276580bab80052c5c549f53ddea52d72e81f33", [:mix], [{:hackney, "~> 1.15 and >= 1.15.2", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},

View file

@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
alias Pleroma.Web.RichMedia.Parsers.TwitterCard alias Pleroma.Web.RichMedia.Parsers.TwitterCard
test "returns error when html not contains twitter card" do test "returns error when html not contains twitter card" do
assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"} assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) ==
{:error, "No twitter card metadata found"}
end end
test "parses twitter card with only name attributes" do test "parses twitter card with only name attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -26,7 +29,9 @@ test "parses twitter card with only name attributes" do
end end
test "parses twitter card with only property attributes" do test "parses twitter card with only property attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -45,7 +50,9 @@ test "parses twitter card with only property attributes" do
end end
test "parses twitter card with name & property attributes" do test "parses twitter card with name & property attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -73,7 +80,8 @@ test "respect only first title tag on the page" do
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <> "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg" "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html") html =
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -87,7 +95,9 @@ test "respect only first title tag on the page" do
end end
test "takes first founded title in html head if there is html markup error" do test "takes first founded title in html head if there is html markup error" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,