Merge OGP parser with TwitterCard

This commit is contained in:
Egor Kislitsyn 2020-06-11 17:57:31 +04:00
parent 7aa6c82937
commit 1f35acce54
No known key found for this signature in database
GPG key ID: 1B49CB15B71E7805
9 changed files with 90 additions and 93 deletions

View file

@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [unreleased] ## [unreleased]
### Changed ### Changed
- OGP rich media parser merged with TwitterCard
<details> <details>
<summary>API Changes</summary> <summary>API Changes</summary>
- **Breaking:** Emoji API: changed methods and renamed routes. - **Breaking:** Emoji API: changed methods and renamed routes.

View file

@ -385,7 +385,6 @@
ignore_tld: ["local", "localdomain", "lan"], ignore_tld: ["local", "localdomain", "lan"],
parsers: [ parsers: [
Pleroma.Web.RichMedia.Parsers.TwitterCard, Pleroma.Web.RichMedia.Parsers.TwitterCard,
Pleroma.Web.RichMedia.Parsers.OGP,
Pleroma.Web.RichMedia.Parsers.OEmbed Pleroma.Web.RichMedia.Parsers.OEmbed
], ],
ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl] ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl]

View file

@ -2091,9 +2091,7 @@
description: description:
"List of Rich Media parsers. Module names are shortened (removed leading `Pleroma.Web.RichMedia.Parsers.` part), but on adding custom module you need to use full name.", "List of Rich Media parsers. Module names are shortened (removed leading `Pleroma.Web.RichMedia.Parsers.` part), but on adding custom module you need to use full name.",
suggestions: [ suggestions: [
Pleroma.Web.RichMedia.Parsers.MetaTagsParser,
Pleroma.Web.RichMedia.Parsers.OEmbed, Pleroma.Web.RichMedia.Parsers.OEmbed,
Pleroma.Web.RichMedia.Parsers.OGP,
Pleroma.Web.RichMedia.Parsers.TwitterCard Pleroma.Web.RichMedia.Parsers.TwitterCard
] ]
}, },

View file

@ -105,8 +105,8 @@ defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->
case parser.parse(html, acc) do case parser.parse(html, acc) do
{:ok, data} -> {:halt, data} data when data != %{} -> {:halt, data}
{:error, _msg} -> {:cont, acc} _ -> {:cont, acc}
end end
end) end)
end end

View file

@ -3,22 +3,15 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do
def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do def parse(data, html, prefix, key_name, value_name \\ "content") do
meta_data = html
html |> get_elements(key_name, prefix)
|> get_elements(key_name, prefix) |> Enum.reduce(data, fn el, acc ->
|> Enum.reduce(data, fn el, acc -> attributes = normalize_attributes(el, prefix, key_name, value_name)
attributes = normalize_attributes(el, prefix, key_name, value_name)
Map.merge(acc, attributes) Map.merge(acc, attributes)
end) end)
|> maybe_put_title(html) |> maybe_put_title(html)
if Enum.empty?(meta_data) do
{:error, error_message}
else
{:ok, meta_data}
end
end end
defp get_elements(html, key_name, prefix) do defp get_elements(html, key_name, prefix) do

View file

@ -7,9 +7,9 @@ def parse(html, _data) do
with elements = [_ | _] <- get_discovery_data(html), with elements = [_ | _] <- get_discovery_data(html),
{:ok, oembed_url} <- get_oembed_url(elements), {:ok, oembed_url} <- get_oembed_url(elements),
{:ok, oembed_data} <- get_oembed_data(oembed_url) do {:ok, oembed_data} <- get_oembed_data(oembed_url) do
{:ok, oembed_data} oembed_data
else else
_e -> {:error, "No OEmbed data found"} _e -> %{}
end end
end end

View file

@ -5,10 +5,9 @@
defmodule Pleroma.Web.RichMedia.Parsers.OGP do defmodule Pleroma.Web.RichMedia.Parsers.OGP do
def parse(html, data) do def parse(html, data) do
Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse( Pleroma.Web.RichMedia.Parsers.MetaTagsParser.parse(
html,
data, data,
html,
"og", "og",
"No OGP metadata found",
"property" "property"
) )
end end

View file

@ -5,18 +5,11 @@
defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do
alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser
@spec parse(String.t(), map()) :: {:ok, map()} | {:error, String.t()} @spec parse(list(), map()) :: map()
def parse(html, data) do def parse(html, data) do
data data
|> parse_name_attrs(html) |> MetaTagsParser.parse(html, "og", "property")
|> parse_property_attrs(html) |> MetaTagsParser.parse(html, "twitter", "name")
end |> MetaTagsParser.parse(html, "twitter", "property")
defp parse_name_attrs(data, html) do
MetaTagsParser.parse(html, data, "twitter", %{}, "name")
end
defp parse_property_attrs({_, data}, html) do
MetaTagsParser.parse(html, data, "twitter", "No twitter card metadata found", "property")
end end
end end

View file

@ -7,8 +7,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
alias Pleroma.Web.RichMedia.Parsers.TwitterCard alias Pleroma.Web.RichMedia.Parsers.TwitterCard
test "returns error when html not contains twitter card" do test "returns error when html not contains twitter card" do
assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{}
{:error, "No twitter card metadata found"}
end end
test "parses twitter card with only name attributes" do test "parses twitter card with only name attributes" do
@ -17,15 +16,21 @@ test "parses twitter card with only name attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, %{
%{ "app:id:googleplay": "com.nytimes.android",
"app:id:googleplay": "com.nytimes.android", "app:name:googleplay": "NYTimes",
"app:name:googleplay": "NYTimes", "app:url:googleplay": "nytimes://reader/id/100000006583622",
"app:url:googleplay": "nytimes://reader/id/100000006583622", site: nil,
site: nil, description:
title: "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times" image:
}} "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg",
type: "article",
url:
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
title:
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database."
}
end end
test "parses twitter card with only property attributes" do test "parses twitter card with only property attributes" do
@ -34,19 +39,19 @@ test "parses twitter card with only property attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, %{
%{ card: "summary_large_image",
card: "summary_large_image", description:
description: "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", image:
image: "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg",
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", "image:alt": "",
"image:alt": "", title:
title: "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", url:
url: "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" type: "article"
}} }
end end
test "parses twitter card with name & property attributes" do test "parses twitter card with name & property attributes" do
@ -55,23 +60,23 @@ test "parses twitter card with name & property attributes" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, %{
%{ "app:id:googleplay": "com.nytimes.android",
"app:id:googleplay": "com.nytimes.android", "app:name:googleplay": "NYTimes",
"app:name:googleplay": "NYTimes", "app:url:googleplay": "nytimes://reader/id/100000006583622",
"app:url:googleplay": "nytimes://reader/id/100000006583622", card: "summary_large_image",
card: "summary_large_image", description:
description: "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
"With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", image:
image: "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg",
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", "image:alt": "",
"image:alt": "", site: nil,
site: nil, title:
title: "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", url:
url: "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html",
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" type: "article"
}} }
end end
test "respect only first title tag on the page" do test "respect only first title tag on the page" do
@ -84,14 +89,17 @@ test "respect only first title tag on the page" do
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, %{
%{ site: "@atlasobscura",
site: "@atlasobscura", title: "The Missing Grave of Margaret Corbin, Revolutionary War Veteran",
title: card: "summary_large_image",
"The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura", image: image_path,
card: "summary_large_image", description:
image: image_path "She's the only woman veteran honored with a monument at West Point. But where was she buried?",
}} site_name: "Atlas Obscura",
type: "article",
url: "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point"
}
end end
test "takes first founded title in html head if there is html markup error" do test "takes first founded title in html head if there is html markup error" do
@ -100,14 +108,20 @@ test "takes first founded title in html head if there is html markup error" do
|> Floki.parse_document!() |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, %{
%{ site: nil,
site: nil, title:
title: "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.",
"She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", "app:id:googleplay": "com.nytimes.android",
"app:id:googleplay": "com.nytimes.android", "app:name:googleplay": "NYTimes",
"app:name:googleplay": "NYTimes", "app:url:googleplay": "nytimes://reader/id/100000006583622",
"app:url:googleplay": "nytimes://reader/id/100000006583622" description:
}} "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.",
image:
"https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg",
type: "article",
url:
"https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html"
}
end end
end end