From d1ef0d1fb3599138d3bd3f936439d53950eddfc1 Mon Sep 17 00:00:00 2001 From: Marcel Otto Date: Sun, 9 Jul 2017 22:21:07 +0200 Subject: [PATCH] Proper implementation of N-Triples, N-Quads and Turtle escaping rules --- CHANGELOG.md | 3 +- lib/rdf/serialization/parse_helper.ex | 32 ++++++++++--- lib/rdf/serializations/turtle_decoder.ex | 11 ++++- test/unit/serialization/parse_helper_test.exs | 48 +++++++++++++++++++ 4 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 test/unit/serialization/parse_helper_test.exs diff --git a/CHANGELOG.md b/CHANGELOG.md index 15eda62..781cc15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,8 @@ This project adheres to [Semantic Versioning](http://semver.org/) and ### Fixed -- booleans weren't recognized as convertible literals on object positions +- booleans weren't recognized as convertible literals on object positions +- N-Triples and N-Quads decoder didn't handle escaping properly diff --git a/lib/rdf/serialization/parse_helper.ex b/lib/rdf/serialization/parse_helper.ex index a1e0a01..574628a 100644 --- a/lib/rdf/serialization/parse_helper.ex +++ b/lib/rdf/serialization/parse_helper.ex @@ -7,17 +7,17 @@ defmodule RDF.Serialization.ParseHelper do def rdf_type, do: @rdf_type - def to_uri_string({:iriref, line, value}), do: value + def to_uri_string({:iriref, line, value}), do: value |> iri_unescape def to_uri({:iriref, line, value}) do - case URI.parse(value) do + case URI.parse(iri_unescape(value)) do %URI{scheme: nil} -> {:error, line, "#{value} is not a valid URI"} parsed_uri -> {:ok, parsed_uri} end end def to_absolute_or_relative_uri({:iriref, line, value}) do - case URI.parse(value) do + case URI.parse(iri_unescape(value)) do uri = %URI{scheme: scheme} when not is_nil(scheme) -> uri _ -> {:relative_uri, value} end @@ -25,10 +25,10 @@ defmodule RDF.Serialization.ParseHelper do def to_bnode({:blank_node_label, _line, value}), do: RDF.bnode(value) - def to_bnode({:anon, _line}), do: RDF.bnode # TODO: + def to_bnode({:anon, _line}), do: RDF.bnode def to_literal({:string_literal_quote, _line, value}), - do: RDF.literal(value) + do: value |> string_unescape |> RDF.literal def to_literal({:integer, _line, value}), do: RDF.literal(value) def to_literal({:decimal, _line, value}), do: RDF.literal(value) def to_literal({:double, _line, value}), do: RDF.literal(value) @@ -36,7 +36,7 @@ defmodule RDF.Serialization.ParseHelper do def to_literal({:string_literal_quote, _line, value}, {:language, language}), do: RDF.literal(value, language: language) def to_literal({:string_literal_quote, _line, value}, {:datatype, %URI{} = type}), - do: RDF.literal(value, datatype: type) + do: value |> string_unescape |> RDF.literal(datatype: type) def to_literal(string_literal_quote_ast, type), do: {string_literal_quote_ast, type} @@ -58,4 +58,24 @@ defmodule RDF.Serialization.ParseHelper do def prefix_ns(value), do: value |> List.to_string |> String.slice(0..-2) def prefix_ln(value), do: value |> List.to_string |> String.split(":", parts: 2) |> List.to_tuple + + def string_unescape(string), + do: string |> unescape_8digit_unicode_seq |> Macro.unescape_string(&string_unescape_map(&1)) + def iri_unescape(string), + do: string |> unescape_8digit_unicode_seq |> Macro.unescape_string(&iri_unescape_map(&1)) + + defp string_unescape_map(?b), do: ?\b + defp string_unescape_map(?f), do: ?\f + defp string_unescape_map(?n), do: ?\n + defp string_unescape_map(?r), do: ?\r + defp string_unescape_map(?t), do: ?\t + defp string_unescape_map(?u), do: true + defp string_unescape_map(e), do: e + + defp iri_unescape_map(?u), do: true + defp iri_unescape_map(e), do: e + + def unescape_8digit_unicode_seq(string) do + String.replace(string, ~r/\\U([0-9]|[A-F]|[a-f]){2}(([0-9]|[A-F]|[a-f]){6})/, "\\u{\\2}") + end end diff --git a/lib/rdf/serializations/turtle_decoder.ex b/lib/rdf/serializations/turtle_decoder.ex index 4f5fd10..787b68d 100644 --- a/lib/rdf/serializations/turtle_decoder.ex +++ b/lib/rdf/serializations/turtle_decoder.ex @@ -93,7 +93,7 @@ defmodule RDF.Turtle.Decoder do defp resolve_node({:prefix_ln, line_number, {prefix, name}}, statements, state) do if ns = State.ns(state, prefix) do - {RDF.uri(ns <> name), statements, state} + {RDF.uri(ns <> local_name_unescape(name)), statements, state} else raise "line #{line_number}: undefined prefix #{inspect prefix}" end @@ -107,7 +107,6 @@ defmodule RDF.Turtle.Decoder do end end - defp resolve_node({:relative_uri, relative_uri}, _, %State{base_uri: nil}) do raise "Could not resolve resolve relative IRI '#{relative_uri}', no base uri provided" end @@ -163,4 +162,12 @@ defmodule RDF.Turtle.Decoder do defp resolve_node(node, statements, state), do: {node, statements, state} + defp local_name_unescape(string), + do: Macro.unescape_string(string, &local_name_unescape_map(&1)) + + @reserved_characters ~c[~.-!$&'()*+,;=/?#@%_] + + defp local_name_unescape_map(e) when e in @reserved_characters, do: e + defp local_name_unescape_map(_), do: false + end diff --git a/test/unit/serialization/parse_helper_test.exs b/test/unit/serialization/parse_helper_test.exs new file mode 100644 index 0000000..cd97361 --- /dev/null +++ b/test/unit/serialization/parse_helper_test.exs @@ -0,0 +1,48 @@ +defmodule RDF.Serialization.ParseHelperTest do + use ExUnit.Case, async: false + + alias RDF.Serialization.ParseHelper + + @unicode_seq_4digit %{ + ~S"\u0020" => " ", + ~S"" => "", + ~S"\u03B1:a" => "\xCE\xB1:a", + ~S"a\u003Ab" => "a\x3Ab", + } + + @unicode_seq_8digit %{ + ~S"\U00000020" => " ", + ~S"\U00010000" => "\xF0\x90\x80\x80", + ~S"\U000EFFFF" => "\xF3\xAF\xBF\xBF", + } + + + describe "string escaping" do + test "unescaping of \\uXXXX codepoint escape sequences" do + Enum.each @unicode_seq_4digit, fn {input, output} -> + assert ParseHelper.string_unescape(input) == output + end + end + + test "unescaping of \\UXXXXXXXX codepoint escape sequences" do + Enum.each @unicode_seq_8digit, fn {input, output} -> + assert ParseHelper.string_unescape(input) == output + end + end + end + + describe "IRI escaping" do + test "unescaping of \\uXXXX codepoint escape sequences" do + Enum.each @unicode_seq_4digit, fn {input, output} -> + assert ParseHelper.iri_unescape(input) == output + end + end + + test "unescaping of \\UXXXXXXXX codepoint escape sequences" do + Enum.each @unicode_seq_8digit, fn {input, output} -> + assert ParseHelper.iri_unescape(input) == output + end + end + end + +end