Merge branch 'merge-ogp-twitter-parsers' into 'develop'

Merge OGP parser with TwitterCard

Closes #1835

See merge request pleroma/pleroma!2642
This commit is contained in:
lain 2020-06-15 12:41:48 +00:00
commit 1e49bfa9ac
9 changed files with 92 additions and 99 deletions

View file

@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed ### Changed
- MRF policy to set global expiration for all local Create activities - MRF policy to set global expiration for all local Create activities
- OGP rich media parser merged with TwitterCard
<details> <details>
<summary>API Changes</summary> <summary>API Changes</summary>
- **Breaking:** Emoji API: changed methods and renamed routes. - **Breaking:** Emoji API: changed methods and renamed routes.

View file

@ -387,7 +387,6 @@
ignore_tld: ["local", "localdomain", "lan"], ignore_tld: ["local", "localdomain", "lan"],
parsers: [ parsers: [
Pleroma.Web.RichMedia.Parsers.TwitterCard, Pleroma.Web.RichMedia.Parsers.TwitterCard,
Pleroma.Web.RichMedia.Parsers.OGP,
Pleroma.Web.RichMedia.Parsers.OEmbed Pleroma.Web.RichMedia.Parsers.OEmbed
], ],
ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl] ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl]

View file

@ -2104,9 +2104,7 @@
description: description:
"List of Rich Media parsers. Module names are shortened (removed leading `Pleroma.Web.RichMedia.Parsers.` part), but on adding custom module you need to use full name.", "List of Rich Media parsers. Module names are shortened (removed leading `Pleroma.Web.RichMedia.Parsers.` part), but on adding custom module you need to use full name.",
suggestions: [ suggestions: [
Pleroma.Web.RichMedia.Parsers.MetaTagsParser,
Pleroma.Web.RichMedia.Parsers.OEmbed, Pleroma.Web.RichMedia.Parsers.OEmbed,
Pleroma.Web.RichMedia.Parsers.OGP,
Pleroma.Web.RichMedia.Parsers.TwitterCard Pleroma.Web.RichMedia.Parsers.TwitterCard
] ]
}, },

View file

@ -105,8 +105,8 @@ defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->
case parser.parse(html, acc) do case parser.parse(html, acc) do
{:ok, data} -> {:halt, data} data when data != %{} -> {:halt, data}
{:error, _msg} -> {:cont, acc} _ -> {:cont, acc}
end end
end) end)
end end

View file

@ -3,8 +3,7 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do
def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do def parse(data, html, prefix, key_name, value_name \\ "content") do
meta_data =
html html
|> get_elements(key_name, prefix) |> get_elements(key_name, prefix)
|> Enum.reduce(data, fn el, acc -> |> Enum.reduce(data, fn el, acc ->
@ -13,12 +12,6 @@ def parse(html, data, prefix, error_message, key_name, value_name \\ "content")
Map.merge(acc, attributes) Map.merge(acc, attributes)
end) end)
|> maybe_put_title(html) |> maybe_put_title(html)
if Enum.empty?(meta_data) do
{:error, error_message}
else
{:ok, meta_data}
end
end end
defp get_elements(html, key_name, prefix) do defp get_elements(html, key_name, prefix) do

View file

@ -7,9 +7,9 @@ def parse(html, _data) do
with elements = [_ | _] <- get_discovery_data(html), with elements = [_ | _] <- get_discovery_data(html),
oembed_url when is_binary(oembed_url) <- get_oembed_url(elements), oembed_url when is_binary(oembed_url) <- get_oembed_url(elements),
{:ok, oembed_data} <- get_oembed_data(oembed_url) do {:ok, oembed_data} <- get_oembed_data(oembed_url) do
{:ok, oembed_data} oembed_data
else else
_e -> {:error, "No OEmbed data found"} _e -> %{}
end end
end end

View file

@ -3,13 +3,8 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parsers.OGP do defmodule Pleroma.Web.RichMedia.Parsers.OGP do
def parse(html, data) do @deprecated "OGP parser is deprecated. Use TwitterCard instead."
Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse( def parse(_html, _data) do
html, %{}
data,
"og",
"No OGP metadata found",
"property"
)
end end
end end

View file

@ -5,18 +5,11 @@
defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do
alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser
@spec parse(String.t(), map()) :: {:ok, map()} | {:error, String.t()} @spec parse(list(), map()) :: map()
def parse(html, data) do def parse(html, data) do
data data
|> parse_name_attrs(html) |> MetaTagsParser.parse(html, "og", "property")
|> parse_property_attrs(html) |> MetaTagsParser.parse(html, "twitter", "name")
end |> MetaTagsParser.parse(html, "twitter", "property")
defp parse_name_attrs(data, html) do
MetaTagsParser.parse(html, data, "twitter", %{}, "name")
end
defp parse_property_attrs({_, data}, html) do
MetaTagsParser.parse(html, data, "twitter", "No twitter card metadata found", "property")
end end
end end

View file

@ -7,8 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
alias Pleroma.Web.RichMedia.Parsers.TwitterCard alias Pleroma.Web.RichMedia.Parsers.TwitterCard
test "returns error when html not contains twitter card" do test "returns error when html not contains twitter card" do
assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{}
{:error, "No twitter card metadata found"}
end end
test "parses twitter card with only name attributes" do test "parses twitter card with only name attributes" do
@ -17,15 +16,21 @@ test "parses twitter card with only name attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok,
%{ %{
"app:id:googleplay" => "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay" => "NYTimes", "app:name:googleplay" => "NYTimes",
"app:url:googleplay" => "nytimes://reader/id/100000006583622", "app:url:googleplay" => "nytimes://reader/id/100000006583622",
"site" => nil, "site" => nil,
"description" =>
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
"image" =>
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg",
"type" => "article",
"url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
"title" => "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times" "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database."
}} }
end end
test "parses twitter card with only property attributes" do test "parses twitter card with only property attributes" do
@ -34,7 +39,6 @@ test "parses twitter card with only property attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok,
%{ %{
"card" => "summary_large_image", "card" => "summary_large_image",
"description" => "description" =>
@ -45,8 +49,9 @@ test "parses twitter card with only property attributes" do
"title" => "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"url" => "url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
}} "type" => "article"
}
end end
test "parses twitter card with name & property attributes" do test "parses twitter card with name & property attributes" do
@ -55,7 +60,6 @@ test "parses twitter card with name & property attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok,
%{ %{
"app:id:googleplay" => "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay" => "NYTimes", "app:name:googleplay" => "NYTimes",
@ -70,8 +74,9 @@ test "parses twitter card with name & property attributes" do
"title" => "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"url" => "url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
}} "type" => "article"
}
end end
test "respect only first title tag on the page" do test "respect only first title tag on the page" do
@ -84,14 +89,17 @@ test "respect only first title tag on the page" do
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok,
%{ %{
"site" => "@atlasobscura", "site" => "@atlasobscura",
"title" => "title" => "The Missing Grave of Margaret Corbin, Revolutionary War Veteran",
"The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura",
"card" => "summary_large_image", "card" => "summary_large_image",
"image" => image_path "image" => image_path,
}} "description" =>
"She's the only woman veteran honored with a monument at West Point. But where was she buried?",
"site_name" => "Atlas Obscura",
"type" => "article",
"url" => "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point"
}
end end
test "takes first founded title in html head if there is html markup error" do test "takes first founded title in html head if there is html markup error" do
@ -100,14 +108,20 @@ test "takes first founded title in html head if there is html markup error" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok,
%{ %{
"site" => nil, "site" => nil,
"title" => "title" =>
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"app:id:googleplay" => "com.nytimes.android", "app:id:googleplay" => "com.nytimes.android",
"app:name:googleplay" => "NYTimes", "app:name:googleplay" => "NYTimes",
"app:url:googleplay" => "nytimes://reader/id/100000006583622" "app:url:googleplay" => "nytimes://reader/id/100000006583622",
}} "description" =>
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
"image" =>
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg",
"type" => "article",
"url" =>
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html"
}
end end
end end