Make Floki use fast_html
This commit is contained in:
parent
58299fcfb4
commit
ea1631d7e6
7 changed files with 30 additions and 13 deletions
|
@ -612,6 +612,8 @@
|
||||||
|
|
||||||
config :pleroma, configurable_from_database: false
|
config :pleroma, configurable_from_database: false
|
||||||
|
|
||||||
|
config :floki, :html_parser, Floki.HTMLParser.FastHtml
|
||||||
|
|
||||||
# Import environment specific config. This must remain at the bottom
|
# Import environment specific config. This must remain at the bottom
|
||||||
# of this file so it overrides the configuration defined above.
|
# of this file so it overrides the configuration defined above.
|
||||||
import_config "#{Mix.env()}.exs"
|
import_config "#{Mix.env()}.exs"
|
||||||
|
|
|
@ -108,6 +108,7 @@ def extract_first_external_url(object, content) do
|
||||||
Cachex.fetch!(:scrubber_cache, key, fn _key ->
|
Cachex.fetch!(:scrubber_cache, key, fn _key ->
|
||||||
result =
|
result =
|
||||||
content
|
content
|
||||||
|
|> Floki.parse_fragment!()
|
||||||
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|
||||||
|> Floki.attribute("a", "href")
|
|> Floki.attribute("a", "href")
|
||||||
|> Enum.at(0)
|
|> Enum.at(0)
|
||||||
|
|
|
@ -17,6 +17,7 @@ defp old_user?(%User{} = u) do
|
||||||
# does the post contain links?
|
# does the post contain links?
|
||||||
defp contains_links?(%{"content" => content} = _object) do
|
defp contains_links?(%{"content" => content} = _object) do
|
||||||
content
|
content
|
||||||
|
|> Floki.parse_fragment!()
|
||||||
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|
||||||
|> Floki.attribute("a", "href")
|
|> Floki.attribute("a", "href")
|
||||||
|> length() > 0
|
|> length() > 0
|
||||||
|
|
|
@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do
|
||||||
|
|
||||||
@impl Provider
|
@impl Provider
|
||||||
def build_tags(%{user: user}) do
|
def build_tags(%{user: user}) do
|
||||||
(Floki.attribute(user.bio, "link[rel~=me]", "href") ++
|
bio_tree = Floki.parse_fragment!(user.bio)
|
||||||
Floki.attribute(user.bio, "a[rel~=me]", "href"))
|
|
||||||
|
(Floki.attribute(bio_tree, "link[rel~=me]", "href") ++
|
||||||
|
Floki.attribute(bio_tree, "a[rel~=me]", "href"))
|
||||||
|> Enum.map(fn link ->
|
|> Enum.map(fn link ->
|
||||||
{:link, [rel: "me", href: link], []}
|
{:link, [rel: "me", href: link], []}
|
||||||
end)
|
end)
|
||||||
|
|
|
@ -27,9 +27,10 @@ def parse(_), do: {:error, "No URL provided"}
|
||||||
defp parse_url(url) do
|
defp parse_url(url) do
|
||||||
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
|
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
|
||||||
Pleroma.HTTP.get(url, [], adapter: @hackney_options),
|
Pleroma.HTTP.get(url, [], adapter: @hackney_options),
|
||||||
|
{:ok, html_tree} <- Floki.parse_document(html),
|
||||||
data <-
|
data <-
|
||||||
Floki.attribute(html, "link[rel~=me]", "href") ++
|
Floki.attribute(html_tree, "link[rel~=me]", "href") ++
|
||||||
Floki.attribute(html, "a[rel~=me]", "href") do
|
Floki.attribute(html_tree, "a[rel~=me]", "href") do
|
||||||
{:ok, data}
|
{:ok, data}
|
||||||
end
|
end
|
||||||
rescue
|
rescue
|
||||||
|
|
|
@ -81,18 +81,18 @@ defp parse_url(url) do
|
||||||
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
|
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
|
||||||
|
|
||||||
html
|
html
|
||||||
|> parse_html
|
|> parse_html()
|
||||||
|> maybe_parse()
|
|> maybe_parse()
|
||||||
|> Map.put(:url, url)
|
|> Map.put(:url, url)
|
||||||
|> clean_parsed_data()
|
|> clean_parsed_data()
|
||||||
|> check_parsed_data()
|
|> check_parsed_data()
|
||||||
rescue
|
rescue
|
||||||
e ->
|
e ->
|
||||||
{:error, "Parsing error: #{inspect(e)}"}
|
{:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp parse_html(html), do: Floki.parse(html)
|
defp parse_html(html), do: Floki.parse_document!(html)
|
||||||
|
|
||||||
defp maybe_parse(html) do
|
defp maybe_parse(html) do
|
||||||
Enum.reduce_while(parsers(), %{}, fn parser, acc ->
|
Enum.reduce_while(parsers(), %{}, fn parser, acc ->
|
||||||
|
|
|
@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
||||||
alias Pleroma.Web.RichMedia.Parsers.TwitterCard
|
alias Pleroma.Web.RichMedia.Parsers.TwitterCard
|
||||||
|
|
||||||
test "returns error when html not contains twitter card" do
|
test "returns error when html not contains twitter card" do
|
||||||
assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"}
|
assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) ==
|
||||||
|
{:error, "No twitter card metadata found"}
|
||||||
end
|
end
|
||||||
|
|
||||||
test "parses twitter card with only name attributes" do
|
test "parses twitter card with only name attributes" do
|
||||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|
html =
|
||||||
|
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|
||||||
|
|> Floki.parse_document!()
|
||||||
|
|
||||||
assert TwitterCard.parse(html, %{}) ==
|
assert TwitterCard.parse(html, %{}) ==
|
||||||
{:ok,
|
{:ok,
|
||||||
|
@ -26,7 +29,9 @@ test "parses twitter card with only name attributes" do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "parses twitter card with only property attributes" do
|
test "parses twitter card with only property attributes" do
|
||||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|
html =
|
||||||
|
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|
||||||
|
|> Floki.parse_document!()
|
||||||
|
|
||||||
assert TwitterCard.parse(html, %{}) ==
|
assert TwitterCard.parse(html, %{}) ==
|
||||||
{:ok,
|
{:ok,
|
||||||
|
@ -45,7 +50,9 @@ test "parses twitter card with only property attributes" do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "parses twitter card with name & property attributes" do
|
test "parses twitter card with name & property attributes" do
|
||||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|
html =
|
||||||
|
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|
||||||
|
|> Floki.parse_document!()
|
||||||
|
|
||||||
assert TwitterCard.parse(html, %{}) ==
|
assert TwitterCard.parse(html, %{}) ==
|
||||||
{:ok,
|
{:ok,
|
||||||
|
@ -73,7 +80,8 @@ test "respect only first title tag on the page" do
|
||||||
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
|
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
|
||||||
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
|
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
|
||||||
|
|
||||||
html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html")
|
html =
|
||||||
|
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
|
||||||
|
|
||||||
assert TwitterCard.parse(html, %{}) ==
|
assert TwitterCard.parse(html, %{}) ==
|
||||||
{:ok,
|
{:ok,
|
||||||
|
@ -87,7 +95,9 @@ test "respect only first title tag on the page" do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "takes first founded title in html head if there is html markup error" do
|
test "takes first founded title in html head if there is html markup error" do
|
||||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|
html =
|
||||||
|
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|
||||||
|
|> Floki.parse_document!()
|
||||||
|
|
||||||
assert TwitterCard.parse(html, %{}) ==
|
assert TwitterCard.parse(html, %{}) ==
|
||||||
{:ok,
|
{:ok,
|
||||||
|
|
Loading…
Reference in a new issue