Proper implementation of N-Triples, N-Quads and Turtle escaping rules

This commit is contained in:
Marcel Otto 2017-07-09 22:21:07 +02:00
parent 7f5012dc0e
commit d1ef0d1fb3
4 changed files with 85 additions and 9 deletions

View file

@ -19,7 +19,8 @@ This project adheres to [Semantic Versioning](http://semver.org/) and
### Fixed
- booleans weren't recognized as convertible literals on object positions
- booleans weren't recognized as convertible literals on object positions
- N-Triples and N-Quads decoders didn't handle escaping properly

View file

@ -7,17 +7,17 @@ defmodule RDF.Serialization.ParseHelper do
# Returns the value stored in the `@rdf_type` module attribute.
def rdf_type do
  @rdf_type
end
def to_uri_string({:iriref, line, value}), do: value
def to_uri_string({:iriref, line, value}), do: value |> iri_unescape
def to_uri({:iriref, line, value}) do
case URI.parse(value) do
case URI.parse(iri_unescape(value)) do
%URI{scheme: nil} -> {:error, line, "#{value} is not a valid URI"}
parsed_uri -> {:ok, parsed_uri}
end
end
def to_absolute_or_relative_uri({:iriref, line, value}) do
case URI.parse(value) do
case URI.parse(iri_unescape(value)) do
uri = %URI{scheme: scheme} when not is_nil(scheme) -> uri
_ -> {:relative_uri, value}
end
@ -25,10 +25,10 @@ defmodule RDF.Serialization.ParseHelper do
# Builds a blank node from a `:blank_node_label` token, using the token's value as label.
def to_bnode({:blank_node_label, _line, label}) do
  RDF.bnode(label)
end
def to_bnode({:anon, _line}), do: RDF.bnode # TODO:
def to_bnode({:anon, _line}), do: RDF.bnode
def to_literal({:string_literal_quote, _line, value}),
do: RDF.literal(value)
do: value |> string_unescape |> RDF.literal
# Numeric tokens (`:integer`, `:decimal`, `:double`) carry their value as-is,
# so it is handed to `RDF.literal/1` unchanged.
def to_literal({kind, _line, value}) when kind in [:integer, :decimal, :double] do
  RDF.literal(value)
end
@ -36,7 +36,7 @@ defmodule RDF.Serialization.ParseHelper do
# Language-tagged string literal. The lexical value is unescaped before the
# literal is built, exactly like the plain and the datatyped string clauses do;
# without this, escape sequences in language-tagged strings would be kept raw.
def to_literal({:string_literal_quote, _line, value}, {:language, language}),
  do: value |> string_unescape() |> RDF.literal(language: language)
def to_literal({:string_literal_quote, _line, value}, {:datatype, %URI{} = type}),
do: RDF.literal(value, datatype: type)
do: value |> string_unescape |> RDF.literal(datatype: type)
# Fallback clause: when no earlier clause matched the type, the token and the
# type are returned together as a pair, untouched.
def to_literal(string_literal_quote_ast, type) do
  {string_literal_quote_ast, type}
end
@ -58,4 +58,24 @@ defmodule RDF.Serialization.ParseHelper do
# Converts the charlist `value` to a string and drops its last character
# (presumably the trailing `:` of a prefix token - confirm against the lexer).
#
# Slices by start/length instead of the range `0..-2`: a range whose last
# element is smaller than its first has a negative step, which newer Elixir
# releases warn about when used for slicing. The result is the same, including
# `""` for empty and single-character input.
def prefix_ns(value) do
  token = List.to_string(value)
  String.slice(token, 0, max(String.length(token) - 1, 0))
end
# Converts the charlist `value` to a string and splits it at the first `:`,
# returning the parts as a tuple (a one-element tuple when there is no `:`).
def prefix_ln(value) do
  parts = String.split(List.to_string(value), ":", parts: 2)
  List.to_tuple(parts)
end
@doc ~S"""
Resolves the escape sequences in the lexical form of a string literal.

`\b`, `\f`, `\n`, `\r` and `\t` become the respective control characters,
`\uXXXX` and `\UXXXXXXXX` become the denoted code point, and for any other
escaped character the backslash is dropped and the character kept (so `\"`
becomes `"` and `\\` becomes `\`). `\x` and a backslash directly before a line
break are left untouched.
"""
def string_unescape(string) do
  string
  |> unescape_8digit_unicode_seq()
  |> Macro.unescape_string(&string_unescape_map/1)
end

@doc ~S"""
Resolves the escape sequences in an IRI reference.

Only `\uXXXX` and `\UXXXXXXXX` are interpreted as code points; for any other
escaped character the backslash is dropped and the character kept. `\x` and a
backslash directly before a line break are left untouched.
"""
def iri_unescape(string) do
  string
  |> unescape_8digit_unicode_seq()
  |> Macro.unescape_string(&iri_unescape_map/1)
end

# Mapping functions for `Macro.unescape_string/2`: an integer result replaces
# the escape, `true` enables code point parsing and `false` keeps the backslash.
# Depending on the Elixir version, unicode handling is requested with `?u` or
# with the atom `:unicode` (newer releases also ask about `:hex` and
# `:newline`), so both forms are answered explicitly instead of letting the
# atoms fall through to the catch-all clause.
defp string_unescape_map(?b), do: ?\b
defp string_unescape_map(?f), do: ?\f
defp string_unescape_map(?n), do: ?\n
defp string_unescape_map(?r), do: ?\r
defp string_unescape_map(?t), do: ?\t
defp string_unescape_map(?u), do: true
defp string_unescape_map(:unicode), do: true
defp string_unescape_map(special) when special in [:hex, :newline], do: false
defp string_unescape_map(e), do: e

defp iri_unescape_map(?u), do: true
defp iri_unescape_map(:unicode), do: true
defp iri_unescape_map(special) when special in [:hex, :newline], do: false
defp iri_unescape_map(e), do: e

@doc ~S"""
Rewrites every `\UXXXXXXXX` escape into the `\u{XXXXXX}` form that
`Macro.unescape_string/2` understands.

The two leading hex digits are dropped; this loses nothing for valid code
points, whose maximum is U+10FFFF. All other escapes, including escaped
backslashes, are returned unchanged.
"""
def unescape_8digit_unicode_seq(string) do
  # Every backslash is consumed together with what it escapes, so the second
  # backslash of an escaped backslash (`\\`) can never be mistaken for the
  # start of a `\U` sequence.
  Regex.replace(~r/\\(?:U[0-9A-Fa-f]{8}|.)/s, string, fn
    <<?\\, ?U, _leading::binary-size(2), code_point::binary-size(6)>> ->
      "\\u{" <> code_point <> "}"

    other_escape ->
      other_escape
  end)
end
end

View file

@ -93,7 +93,7 @@ defmodule RDF.Turtle.Decoder do
defp resolve_node({:prefix_ln, line_number, {prefix, name}}, statements, state) do
if ns = State.ns(state, prefix) do
{RDF.uri(ns <> name), statements, state}
{RDF.uri(ns <> local_name_unescape(name)), statements, state}
else
raise "line #{line_number}: undefined prefix #{inspect prefix}"
end
@ -107,7 +107,6 @@ defmodule RDF.Turtle.Decoder do
end
end
# A relative IRI cannot be resolved when the state has no base URI
# (`base_uri: nil`), so this clause raises. The message previously contained
# the duplicated word "resolve resolve".
defp resolve_node({:relative_uri, relative_uri}, _, %State{base_uri: nil}) do
  raise "Could not resolve relative IRI '#{relative_uri}', no base uri provided"
end
@ -163,4 +162,12 @@ defmodule RDF.Turtle.Decoder do
# Catch-all clause: the node needs no resolution, so node, statements and state
# are returned unchanged.
defp resolve_node(node, statements, state) do
  {node, statements, state}
end
# Characters whose backslash escape is resolved in the local part of a
# prefixed name.
@reserved_characters ~c[~.-!$&'()*+,;=/?#@%_]

# Removes the backslash in front of every reserved character of `string`;
# all other escape sequences are left as they are.
defp local_name_unescape(string) do
  Macro.unescape_string(string, fn char -> local_name_unescape_map(char) end)
end

# Mapping function for `Macro.unescape_string/2`: returning the character
# replaces the escape with it, returning `false` keeps the backslash.
defp local_name_unescape_map(char) do
  if char in @reserved_characters, do: char, else: false
end
end

View file

@ -0,0 +1,48 @@
defmodule RDF.Serialization.ParseHelperTest do
  # Only `ParseHelper.string_unescape/1` and `ParseHelper.iri_unescape/1` are
  # exercised here; both just transform their argument, so nothing prevents
  # these tests from running concurrently with other test modules.
  use ExUnit.Case, async: true

  alias RDF.Serialization.ParseHelper

  # Inputs with `\uXXXX` escapes mapped to the expected UTF-8 encoded result.
  @unicode_seq_4digit %{
    ~S"\u0020" => " ",
    ~S"<ab\u00E9xy>" => "<ab\xC3\xA9xy>",
    ~S"\u03B1:a" => "\xCE\xB1:a",
    ~S"a\u003Ab" => "a\x3Ab"
  }

  # Inputs with `\UXXXXXXXX` escapes mapped to the expected UTF-8 encoded result.
  @unicode_seq_8digit %{
    ~S"\U00000020" => " ",
    ~S"\U00010000" => "\xF0\x90\x80\x80",
    ~S"\U000EFFFF" => "\xF3\xAF\xBF\xBF"
  }

  describe "string escaping" do
    test "unescaping of \\uXXXX codepoint escape sequences" do
      Enum.each(@unicode_seq_4digit, fn {input, output} ->
        assert ParseHelper.string_unescape(input) == output
      end)
    end

    test "unescaping of \\UXXXXXXXX codepoint escape sequences" do
      Enum.each(@unicode_seq_8digit, fn {input, output} ->
        assert ParseHelper.string_unescape(input) == output
      end)
    end

    test "unescaping of single-character escape sequences" do
      assert ParseHelper.string_unescape(~S"\t\b\n\r\f") == "\t\b\n\r\f"
      assert ParseHelper.string_unescape(~S(\"\'\\)) == ~S("'\)
    end
  end

  describe "IRI escaping" do
    test "unescaping of \\uXXXX codepoint escape sequences" do
      Enum.each(@unicode_seq_4digit, fn {input, output} ->
        assert ParseHelper.iri_unescape(input) == output
      end)
    end

    test "unescaping of \\UXXXXXXXX codepoint escape sequences" do
      Enum.each(@unicode_seq_8digit, fn {input, output} ->
        assert ParseHelper.iri_unescape(input) == output
      end)
    end
  end
end