Proper implementation of N-Triples, N-Quads and Turtle escaping rules
This commit is contained in:
parent
7f5012dc0e
commit
d1ef0d1fb3
4 changed files with 85 additions and 9 deletions
|
@ -19,7 +19,8 @@ This project adheres to [Semantic Versioning](http://semver.org/) and
|
|||
|
||||
### Fixed
|
||||
|
||||
- booleans weren't recognized as convertible literals on object positions
|
||||
- booleans weren't recognized as convertible literals on object positions
|
||||
- N-Triples and N-Quads decoder didn't handle escaping properly
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -7,17 +7,17 @@ defmodule RDF.Serialization.ParseHelper do
|
|||
def rdf_type, do: @rdf_type
|
||||
|
||||
|
||||
def to_uri_string({:iriref, line, value}), do: value
|
||||
# Extracts the IRI string from an `:iriref` token and resolves its escape
# sequences with `iri_unescape/1`. The token's line number is not needed here,
# hence the underscore-prefixed binding (avoids an unused-variable warning).
def to_uri_string({:iriref, _line, value}), do: iri_unescape(value)
|
||||
|
||||
def to_uri({:iriref, line, value}) do
|
||||
case URI.parse(value) do
|
||||
case URI.parse(iri_unescape(value)) do
|
||||
%URI{scheme: nil} -> {:error, line, "#{value} is not a valid URI"}
|
||||
parsed_uri -> {:ok, parsed_uri}
|
||||
end
|
||||
end
|
||||
|
||||
def to_absolute_or_relative_uri({:iriref, line, value}) do
|
||||
case URI.parse(value) do
|
||||
case URI.parse(iri_unescape(value)) do
|
||||
uri = %URI{scheme: scheme} when not is_nil(scheme) -> uri
|
||||
_ -> {:relative_uri, value}
|
||||
end
|
||||
|
@ -25,10 +25,10 @@ defmodule RDF.Serialization.ParseHelper do
|
|||
|
||||
|
||||
def to_bnode({:blank_node_label, _line, value}), do: RDF.bnode(value)
|
||||
def to_bnode({:anon, _line}), do: RDF.bnode # TODO:
|
||||
def to_bnode({:anon, _line}), do: RDF.bnode
|
||||
|
||||
def to_literal({:string_literal_quote, _line, value}),
|
||||
do: RDF.literal(value)
|
||||
do: value |> string_unescape |> RDF.literal
|
||||
def to_literal({:integer, _line, value}), do: RDF.literal(value)
|
||||
def to_literal({:decimal, _line, value}), do: RDF.literal(value)
|
||||
def to_literal({:double, _line, value}), do: RDF.literal(value)
|
||||
|
@ -36,7 +36,7 @@ defmodule RDF.Serialization.ParseHelper do
|
|||
# Builds a language-tagged literal from a quoted string token.
#
# The lexical value is run through `string_unescape/1` before the literal is
# created, exactly like the plain and the datatyped string clauses of
# `to_literal`, so escape sequences are resolved for language-tagged strings
# as well instead of ending up verbatim in the literal.
def to_literal({:string_literal_quote, _line, value}, {:language, language}),
  do: value |> string_unescape() |> RDF.literal(language: language)
|
||||
def to_literal({:string_literal_quote, _line, value}, {:datatype, %URI{} = type}),
|
||||
do: RDF.literal(value, datatype: type)
|
||||
do: value |> string_unescape |> RDF.literal(datatype: type)
|
||||
def to_literal(string_literal_quote_ast, type),
|
||||
do: {string_literal_quote_ast, type}
|
||||
|
||||
|
@ -58,4 +58,24 @@ defmodule RDF.Serialization.ParseHelper do
|
|||
# Converts the charlist `value` to a binary and returns it without its last
# character (the `0..-2` range keeps everything up to the second-to-last one);
# presumably that character is the trailing colon of a prefix token.
def prefix_ns(value) do
  prefix = List.to_string(value)
  String.slice(prefix, 0..-2)
end
|
||||
# Converts the charlist `value` to a binary and splits it at the first colon
# only (`parts: 2`), returning the pieces as a tuple. Any further colons stay
# in the second element.
def prefix_ln(value) do
  parts =
    value
    |> List.to_string()
    |> String.split(":", parts: 2)

  List.to_tuple(parts)
end
|
||||
|
||||
|
||||
# Resolves the escape sequences of a string literal in two steps:
# `\UXXXXXXXX` sequences are first rewritten by `unescape_8digit_unicode_seq/1`,
# then `Macro.unescape_string/2` processes the result using
# `string_unescape_map/1` as its escape table.
def string_unescape(string) do
  string
  |> unescape_8digit_unicode_seq()
  |> Macro.unescape_string(&string_unescape_map/1)
end
|
||||
# Resolves the escape sequences of an IRI: 8-digit `\U` sequences are rewritten
# by `unescape_8digit_unicode_seq/1` first, the remainder is handled by
# `Macro.unescape_string/2` with `iri_unescape_map/1` as its escape table.
def iri_unescape(string) do
  rewritten = unescape_8digit_unicode_seq(string)
  Macro.unescape_string(rewritten, &iri_unescape_map/1)
end
|
||||
|
||||
# Escape table passed to `Macro.unescape_string/2` by `string_unescape/1`.
# It receives the character that follows a backslash and returns what the
# escape stands for. Returning `true` for `?u` enables `\u` Unicode escapes;
# every character without a dedicated entry stands for itself.
defp string_unescape_map(char) do
  case char do
    ?b -> ?\b
    ?f -> ?\f
    ?n -> ?\n
    ?r -> ?\r
    ?t -> ?\t
    ?u -> true
    other -> other
  end
end
|
||||
|
||||
# Escape table passed to `Macro.unescape_string/2` by `iri_unescape/1`:
# `true` for `?u` enables `\u` Unicode escapes, any other character following
# a backslash is returned unchanged.
defp iri_unescape_map(char) do
  if char === ?u, do: true, else: char
end
|
||||
|
||||
# Rewrites 8-digit `\UXXXXXXXX` escapes into the `\u{XXXXXX}` form, so that a
# subsequent `Macro.unescape_string/2` pass can resolve them.
#
# Two situations are handled explicitly:
#
#   * An escaped backslash (`\\`) is matched as a unit and emitted unchanged.
#     Without this, the `U` following an escaped backslash (as in
#     `\\U00000020`) would be mistaken for the start of an escape sequence.
#   * Only sequences whose two leading hex digits are `00` are rewritten.
#     Other values cannot be expressed with six hex digits; they are left
#     untouched rather than silently dropping the leading digits and producing
#     an unrelated code point.
#
# Returns the rewritten string; input without such sequences is returned as is.
def unescape_8digit_unicode_seq(string) do
  Regex.replace(~r/\\\\|\\U00([0-9A-Fa-f]{6})/, string, fn
    # Escaped backslash: keep both characters, the capture group is empty here.
    "\\\\", _hex -> "\\\\"
    _sequence, hex -> "\\u{" <> hex <> "}"
  end)
end
|
||||
end
|
||||
|
|
|
@ -93,7 +93,7 @@ defmodule RDF.Turtle.Decoder do
|
|||
|
||||
defp resolve_node({:prefix_ln, line_number, {prefix, name}}, statements, state) do
|
||||
if ns = State.ns(state, prefix) do
|
||||
{RDF.uri(ns <> name), statements, state}
|
||||
{RDF.uri(ns <> local_name_unescape(name)), statements, state}
|
||||
else
|
||||
raise "line #{line_number}: undefined prefix #{inspect prefix}"
|
||||
end
|
||||
|
@ -107,7 +107,6 @@ defmodule RDF.Turtle.Decoder do
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
# A relative IRI cannot be resolved when the decoder state has no base URI
# (`base_uri: nil`), so this clause aborts with a RuntimeError naming the
# offending IRI.
defp resolve_node({:relative_uri, relative_uri}, _, %State{base_uri: nil}) do
  raise "Could not resolve relative IRI '#{relative_uri}', no base uri provided"
end
|
||||
|
@ -163,4 +162,12 @@ defmodule RDF.Turtle.Decoder do
|
|||
|
||||
defp resolve_node(node, statements, state), do: {node, statements, state}
|
||||
|
||||
# Resolves backslash escapes in the local part of a prefixed name by running
# `Macro.unescape_string/2` with `local_name_unescape_map/1` as escape table.
defp local_name_unescape(string) do
  Macro.unescape_string(string, &local_name_unescape_map/1)
end
|
||||
|
||||
@reserved_characters ~c[~.-!$&'()*+,;=/?#@%_]
|
||||
|
||||
# Escape table for `local_name_unescape/1`: a character listed in
# `@reserved_characters` stands for itself when escaped, for anything else
# `false` is returned.
defp local_name_unescape_map(char) do
  if char in @reserved_characters, do: char, else: false
end
|
||||
|
||||
end
|
||||
|
|
48
test/unit/serialization/parse_helper_test.exs
Normal file
48
test/unit/serialization/parse_helper_test.exs
Normal file
|
@ -0,0 +1,48 @@
|
|||
defmodule RDF.Serialization.ParseHelperTest do
  # The functions under test are called with plain strings and compared against
  # plain strings; nothing here touches shared state, so the module can run
  # concurrently with other test modules.
  use ExUnit.Case, async: true

  alias RDF.Serialization.ParseHelper

  # Escaped input => expected result for 4-digit `\uXXXX` sequences.
  @unicode_seq_4digit %{
    ~S"\u0020"       => " ",
    ~S"<ab\u00E9xy>" => "<ab\xC3\xA9xy>",
    ~S"\u03B1:a"     => "\xCE\xB1:a",
    ~S"a\u003Ab"     => "a\x3Ab",
  }

  # Escaped input => expected result for 8-digit `\UXXXXXXXX` sequences.
  @unicode_seq_8digit %{
    ~S"\U00000020" => " ",
    ~S"\U00010000" => "\xF0\x90\x80\x80",
    ~S"\U000EFFFF" => "\xF3\xAF\xBF\xBF",
  }

  describe "string escaping" do
    test "unescaping of \\uXXXX codepoint escape sequences" do
      for {input, output} <- @unicode_seq_4digit do
        assert ParseHelper.string_unescape(input) == output
      end
    end

    test "unescaping of \\UXXXXXXXX codepoint escape sequences" do
      for {input, output} <- @unicode_seq_8digit do
        assert ParseHelper.string_unescape(input) == output
      end
    end
  end

  describe "IRI escaping" do
    test "unescaping of \\uXXXX codepoint escape sequences" do
      for {input, output} <- @unicode_seq_4digit do
        assert ParseHelper.iri_unescape(input) == output
      end
    end

    test "unescaping of \\UXXXXXXXX codepoint escape sequences" do
      for {input, output} <- @unicode_seq_8digit do
        assert ParseHelper.iri_unescape(input) == output
      end
    end
  end
end
|
Loading…
Reference in a new issue