Fix atom leak in Rich Media Parser

This commit is contained in:
Egor Kislitsyn 2020-06-09 21:49:24 +04:00 committed by rinpatch
parent e2793744c5
commit 520367d6fd
7 changed files with 91 additions and 102 deletions

View file

@ -377,8 +377,8 @@ def render("card.json", %{rich_media: rich_media, page_url: page_url}) do
page_url_data = URI.parse(page_url) page_url_data = URI.parse(page_url)
page_url_data = page_url_data =
if rich_media[:url] != nil do if is_binary(rich_media["url"]) do
URI.merge(page_url_data, URI.parse(rich_media[:url])) URI.merge(page_url_data, URI.parse(rich_media["url"]))
else else
page_url_data page_url_data
end end
@ -386,11 +386,9 @@ def render("card.json", %{rich_media: rich_media, page_url: page_url}) do
page_url = page_url_data |> to_string page_url = page_url_data |> to_string
image_url = image_url =
if rich_media[:image] != nil do if is_binary(rich_media["image"]) do
URI.merge(page_url_data, URI.parse(rich_media[:image])) URI.merge(page_url_data, URI.parse(rich_media["image"]))
|> to_string |> to_string
else
nil
end end
%{ %{
@ -399,8 +397,8 @@ def render("card.json", %{rich_media: rich_media, page_url: page_url}) do
provider_url: page_url_data.scheme <> "://" <> page_url_data.host, provider_url: page_url_data.scheme <> "://" <> page_url_data.host,
url: page_url, url: page_url,
image: image_url |> MediaProxy.url(), image: image_url |> MediaProxy.url(),
title: rich_media[:title] || "", title: rich_media["title"] || "",
description: rich_media[:description] || "", description: rich_media["description"] || "",
pleroma: %{ pleroma: %{
opengraph: rich_media opengraph: rich_media
} }

View file

@ -9,7 +9,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Object alias Pleroma.Object
alias Pleroma.Web.RichMedia.Parser alias Pleroma.Web.RichMedia.Parser
@spec validate_page_url(any()) :: :ok | :error @spec validate_page_url(URI.t() | binary()) :: :ok | :error
defp validate_page_url(page_url) when is_binary(page_url) do defp validate_page_url(page_url) when is_binary(page_url) do
validate_tld = Application.get_env(:auto_linker, :opts)[:validate_tld] validate_tld = Application.get_env(:auto_linker, :opts)[:validate_tld]
@ -18,8 +18,8 @@ defp validate_page_url(page_url) when is_binary(page_url) do
|> parse_uri(page_url) |> parse_uri(page_url)
end end
defp validate_page_url(%URI{host: host, scheme: scheme, authority: authority}) defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
when scheme == "https" and not is_nil(authority) do when is_binary(authority) do
cond do cond do
host in Config.get([:rich_media, :ignore_hosts], []) -> host in Config.get([:rich_media, :ignore_hosts], []) ->
:error :error

View file

@ -91,7 +91,7 @@ defp parse_url(url) do
html html
|> parse_html() |> parse_html()
|> maybe_parse() |> maybe_parse()
|> Map.put(:url, url) |> Map.put("url", url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
rescue rescue
@ -111,8 +111,8 @@ defp maybe_parse(html) do
end) end)
end end
defp check_parsed_data(%{title: title} = data) defp check_parsed_data(%{"title" => title} = data)
when is_binary(title) and byte_size(title) > 0 do when is_binary(title) and title != "" do
{:ok, data} {:ok, data}
end end
@ -123,11 +123,7 @@ defp check_parsed_data(data) do
defp clean_parsed_data(data) do defp clean_parsed_data(data) do
data data
|> Enum.reject(fn {key, val} -> |> Enum.reject(fn {key, val} ->
with {:ok, _} <- Jason.encode(%{key => val}) do not match?({:ok, _}, Jason.encode(%{key => val}))
false
else
_ -> true
end
end) end)
|> Map.new() |> Map.new()
end end

View file

@ -29,19 +29,19 @@ defp normalize_attributes(html_node, prefix, key_name, value_name) do
{_tag, attributes, _children} = html_node {_tag, attributes, _children} = html_node
data = data =
Enum.into(attributes, %{}, fn {name, value} -> Map.new(attributes, fn {name, value} ->
{name, String.trim_leading(value, "#{prefix}:")} {name, String.trim_leading(value, "#{prefix}:")}
end) end)
%{String.to_atom(data[key_name]) => data[value_name]} %{data[key_name] => data[value_name]}
end end
defp maybe_put_title(%{title: _} = meta, _), do: meta defp maybe_put_title(%{"title" => _} = meta, _), do: meta
defp maybe_put_title(meta, html) when meta != %{} do defp maybe_put_title(meta, html) when meta != %{} do
case get_page_title(html) do case get_page_title(html) do
"" -> meta "" -> meta
title -> Map.put_new(meta, :title, title) title -> Map.put_new(meta, "title", title)
end end
end end

View file

@ -5,7 +5,7 @@
defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do
def parse(html, _data) do def parse(html, _data) do
with elements = [_ | _] <- get_discovery_data(html), with elements = [_ | _] <- get_discovery_data(html),
{:ok, oembed_url} <- get_oembed_url(elements), oembed_url when is_binary(oembed_url) <- get_oembed_url(elements),
{:ok, oembed_data} <- get_oembed_data(oembed_url) do {:ok, oembed_data} <- get_oembed_data(oembed_url) do
{:ok, oembed_data} {:ok, oembed_data}
else else
@ -17,19 +17,13 @@ defp get_discovery_data(html) do
html |> Floki.find("link[type='application/json+oembed']") html |> Floki.find("link[type='application/json+oembed']")
end end
defp get_oembed_url(nodes) do defp get_oembed_url([{"link", attributes, _children} | _]) do
{"link", attributes, _children} = nodes |> hd() Enum.find_value(attributes, fn {k, v} -> if k == "href", do: v end)
{:ok, Enum.into(attributes, %{})["href"]}
end end
defp get_oembed_data(url) do defp get_oembed_data(url) do
{:ok, %Tesla.Env{body: json}} = Pleroma.HTTP.get(url, [], adapter: [pool: :media]) with {:ok, %Tesla.Env{body: json}} <- Pleroma.HTTP.get(url, [], adapter: [pool: :media]) do
Jason.decode(json)
{:ok, data} = Jason.decode(json) end
data = data |> Map.new(fn {k, v} -> {String.to_atom(k), v} end)
{:ok, data}
end end
end end

View file

@ -60,19 +60,19 @@ test "returns error when no metadata present" do
test "doesn't just add a title" do test "doesn't just add a title" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/non-ogp") == assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/non-ogp") ==
{:error, {:error,
"Found metadata was invalid or incomplete: %{url: \"http://example.com/non-ogp\"}"} "Found metadata was invalid or incomplete: %{\"url\" => \"http://example.com/non-ogp\"}"}
end end
test "parses ogp" do test "parses ogp" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp") == assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp") ==
{:ok, {:ok,
%{ %{
image: "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
title: "The Rock", "title" => "The Rock",
description: "description" =>
"Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.", "Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.",
type: "video.movie", "type" => "video.movie",
url: "http://example.com/ogp" "url" => "http://example.com/ogp"
}} }}
end end
@ -80,12 +80,12 @@ test "falls back to <title> when ogp:title is missing" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp-missing-title") == assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp-missing-title") ==
{:ok, {:ok,
%{ %{
image: "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
title: "The Rock (1996)", "title" => "The Rock (1996)",
description: "description" =>
"Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.", "Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.",
type: "video.movie", "type" => "video.movie",
url: "http://example.com/ogp-missing-title" "url" => "http://example.com/ogp-missing-title"
}} }}
end end
@ -93,12 +93,12 @@ test "parses twitter card" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/twitter-card") == assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/twitter-card") ==
{:ok, {:ok,
%{ %{
card: "summary", "card" => "summary",
site: "@flickr", "site" => "@flickr",
image: "https://farm6.staticflickr.com/5510/14338202952_93595258ff_z.jpg", "image" => "https://farm6.staticflickr.com/5510/14338202952_93595258ff_z.jpg",
title: "Small Island Developing States Photo Submission", "title" => "Small Island Developing States Photo Submission",
description: "View the album on Flickr.", "description" => "View the album on Flickr.",
url: "http://example.com/twitter-card" "url" => "http://example.com/twitter-card"
}} }}
end end
@ -106,27 +106,28 @@ test "parses OEmbed" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/oembed") == assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/oembed") ==
{:ok, {:ok,
%{ %{
author_name: "bees", "author_name" => "bees",
author_url: "https://www.flickr.com/photos/bees/", "author_url" => "https://www.flickr.com/photos/bees/",
cache_age: 3600, "cache_age" => 3600,
flickr_type: "photo", "flickr_type" => "photo",
height: "768", "height" => "768",
html: "html" =>
"<a data-flickr-embed=\"true\" href=\"https://www.flickr.com/photos/bees/2362225867/\" title=\"Bacon Lollys by bees, on Flickr\"><img src=\"https://farm4.staticflickr.com/3040/2362225867_4a87ab8baf_b.jpg\" width=\"1024\" height=\"768\" alt=\"Bacon Lollys\"></a><script async src=\"https://embedr.flickr.com/assets/client-code.js\" charset=\"utf-8\"></script>", "<a data-flickr-embed=\"true\" href=\"https://www.flickr.com/photos/bees/2362225867/\" title=\"Bacon Lollys by bees, on Flickr\"><img src=\"https://farm4.staticflickr.com/3040/2362225867_4a87ab8baf_b.jpg\" width=\"1024\" height=\"768\" alt=\"Bacon Lollys\"></a><script async src=\"https://embedr.flickr.com/assets/client-code.js\" charset=\"utf-8\"></script>",
license: "All Rights Reserved", "license" => "All Rights Reserved",
license_id: 0, "license_id" => 0,
provider_name: "Flickr", "provider_name" => "Flickr",
provider_url: "https://www.flickr.com/", "provider_url" => "https://www.flickr.com/",
thumbnail_height: 150, "thumbnail_height" => 150,
thumbnail_url: "https://farm4.staticflickr.com/3040/2362225867_4a87ab8baf_q.jpg", "thumbnail_url" =>
thumbnail_width: 150, "https://farm4.staticflickr.com/3040/2362225867_4a87ab8baf_q.jpg",
title: "Bacon Lollys", "thumbnail_width" => 150,
type: "photo", "title" => "Bacon Lollys",
url: "http://example.com/oembed", "type" => "photo",
version: "1.0", "url" => "http://example.com/oembed",
web_page: "https://www.flickr.com/photos/bees/2362225867/", "version" => "1.0",
web_page_short_url: "https://flic.kr/p/4AK2sc", "web_page" => "https://www.flickr.com/photos/bees/2362225867/",
width: "1024" "web_page_short_url" => "https://flic.kr/p/4AK2sc",
"width" => "1024"
}} }}
end end

View file

@ -19,11 +19,11 @@ test "parses twitter card with only name attributes" do
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
%{ %{
"app:id:googleplay": "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay": "NYTimes", "app:name:googleplay" => "NYTimes",
"app:url:googleplay": "nytimes://reader/id/100000006583622", "app:url:googleplay" => "nytimes://reader/id/100000006583622",
site: nil, "site" => nil,
title: "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times" "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times"
}} }}
end end
@ -36,15 +36,15 @@ test "parses twitter card with only property attributes" do
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
%{ %{
card: "summary_large_image", "card" => "summary_large_image",
description: "description" =>
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
image: "image" =>
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg",
"image:alt": "", "image:alt" => "",
title: "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
url: "url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html"
}} }}
end end
@ -57,19 +57,19 @@ test "parses twitter card with name & property attributes" do
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
%{ %{
"app:id:googleplay": "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay": "NYTimes", "app:name:googleplay" => "NYTimes",
"app:url:googleplay": "nytimes://reader/id/100000006583622", "app:url:googleplay" => "nytimes://reader/id/100000006583622",
card: "summary_large_image", "card" => "summary_large_image",
description: "description" =>
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
image: "image" =>
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg",
"image:alt": "", "image:alt" => "",
site: nil, "site" => nil,
title: "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
url: "url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html"
}} }}
end end
@ -86,11 +86,11 @@ test "respect only first title tag on the page" do
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
%{ %{
site: "@atlasobscura", "site" => "@atlasobscura",
title: "title" =>
"The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura", "The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura",
card: "summary_large_image", "card" => "summary_large_image",
image: image_path "image" => image_path
}} }}
end end
@ -102,12 +102,12 @@ test "takes first founded title in html head if there is html markup error" do
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
%{ %{
site: nil, "site" => nil,
title: "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times",
"app:id:googleplay": "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay": "NYTimes", "app:name:googleplay" => "NYTimes",
"app:url:googleplay": "nytimes://reader/id/100000006583622" "app:url:googleplay" => "nytimes://reader/id/100000006583622"
}} }}
end end
end end