diff --git a/CHANGELOG.md b/CHANGELOG.md index 580757e..911908f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ This project adheres to [Semantic Versioning](http://semver.org/) and - general serialization functions for reading from and writing to streams and implementations for N-Triples and N-Quads (Turtle still to come) +- a `:gzip` option flag on all `read_file/3` and `write_file/3` functions + allows to read and write all supported serialization formats from and to + gzipped files (works also with the new possibility to read and write files via streams) - `RDF.Dataset.prefixes/1` for getting an aggregated `RDF.PrefixMap` over all graphs - `RDF.PrefixMap.put/3` for adding a prefix mapping and overwrite an existing one - `RDF.BlankNode.value/1` for getting the internal string representation of a blank node diff --git a/lib/rdf/serialization/format.ex b/lib/rdf/serialization/format.ex index 589f896..175c413 100644 --- a/lib/rdf/serialization/format.ex +++ b/lib/rdf/serialization/format.ex @@ -117,6 +117,14 @@ defmodule RDF.Serialization.Format do It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or dataset, or `{:error, reason}` if an error occurs. + ## Options + + General serialization-independent options: + + - `:gzip`: Allows to read directly from a gzipped file (default: `false`) + - `:file_mode`: A list with the Elixir `File.open` modes to be used for reading + (default: `[:read, :utf8]`) + #{@decoder_doc_ref} """ @spec read_file(Path.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any} @@ -127,6 +135,8 @@ defmodule RDF.Serialization.Format do As opposed to `read_file/2`, it raises an exception if an error occurs. + See `read_file/2` for the available format-independent options. 
+ #{@decoder_doc_ref} """ @spec read_file!(Path.t(), keyword) :: Graph.t() | Dataset.t() @@ -178,9 +188,10 @@ defmodule RDF.Serialization.Format do General serialization-independent options: - - `:force` - If not set to `true`, an error is raised when the given file + - `:gzip`: Allows to write directly to a gzipped file (default: `false`) + - `:force`: If not set to `true`, an error is raised when the given file already exists (default: `false`) - - `:file_mode` - A list with the Elixir `File.open` modes to be used for writing + - `:file_mode`: A list with the Elixir `File.open` modes to be used for writing (default: `[:write, :exclusive]`) #{@encoder_doc_ref} diff --git a/lib/rdf/serialization/reader.ex b/lib/rdf/serialization/reader.ex index 2433b8f..3edfbec 100644 --- a/lib/rdf/serialization/reader.ex +++ b/lib/rdf/serialization/reader.ex @@ -9,6 +9,8 @@ defmodule RDF.Serialization.Reader do alias RDF.{Serialization, Dataset, Graph} + @default_file_mode ~w[read utf8]a + @spec read_string(module, String.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any} def read_string(decoder, content, opts \\ []) do decoder.decode(content, opts) @@ -36,16 +38,19 @@ defmodule RDF.Serialization.Reader do end defp do_read_file(false, decoder, file, opts) do - case File.read(file) do + file + |> File.open(file_mode(decoder, opts), &IO.read(&1, :all)) + |> case do + {:ok, {:error, error}} -> {:error, error} {:ok, content} -> decoder.decode(content, opts) - {:error, reason} -> {:error, reason} + {:error, error} -> {:error, error} end end defp do_read_file(true, decoder, file, opts) do {:ok, file - |> File.stream!() + |> File.stream!(file_mode(decoder, opts)) |> decoder.decode_from_stream(opts)} rescue error in RuntimeError -> {:error, error.message} @@ -61,13 +66,28 @@ defmodule RDF.Serialization.Reader do defp do_read_file!(false, decoder, file, opts) do file - |> File.read!() - |> decoder.decode!(opts) + |> File.open!(file_mode(decoder, opts), &IO.read(&1, :all)) + |> 
case do + {:error, error} when is_tuple(error) -> error |> inspect() |> raise() + {:error, error} -> raise(error) + content -> decoder.decode!(content, opts) + end end defp do_read_file!(true, decoder, file, opts) do file - |> File.stream!() + |> File.stream!(file_mode(decoder, opts)) |> decoder.decode_from_stream(opts) end + + @doc false + def file_mode(_decoder, opts) do + opts + |> Keyword.get(:file_mode, @default_file_mode) + |> List.wrap() + |> set_gzip(Keyword.get(opts, :gzip)) + end + + defp set_gzip(file_mode, true), do: [:compressed | file_mode] + defp set_gzip(file_mode, _), do: file_mode end diff --git a/lib/rdf/serialization/serialization.ex b/lib/rdf/serialization/serialization.ex index 877d4ea..984f2ea 100644 --- a/lib/rdf/serialization/serialization.ex +++ b/lib/rdf/serialization/serialization.ex @@ -187,10 +187,18 @@ defmodule RDF.Serialization do It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or dataset, or `{:error, reason}` if an error occurs. + ## Options + The format can be specified with the `format` option and a format name or the `media_type` option and the media type of the format. If none of these are given, the format gets inferred from the extension of the given file name. + Other available serialization-independent options: + + - `:gzip`: Allows to read directly from a gzipped file (default: `false`) + - `:file_mode`: A list with the Elixir `File.open` modes to be used for reading + (default: `[:read, :utf8]`) + Please refer to the documentation of the decoder of a RDF serialization format for format-specific options. 
""" @@ -292,9 +300,10 @@ defmodule RDF.Serialization do Other available serialization-independent options: - - `:force` - If not set to `true`, an error is raised when the given file + - `:gzip`: Allows to write directly to a gzipped file (default: `false`) + - `:force`: If not set to `true`, an error is raised when the given file already exists (default: `false`) - - `:file_mode` - A list with the Elixir `File.open` modes to be used for writing + - `:file_mode`: A list with the Elixir `File.open` modes to be used for writing (default: `[:write, :exclusive]`) Please refer to the documentation of the encoder of a RDF serialization format diff --git a/lib/rdf/serialization/writer.ex b/lib/rdf/serialization/writer.ex index a0a3f9f..f78495c 100644 --- a/lib/rdf/serialization/writer.ex +++ b/lib/rdf/serialization/writer.ex @@ -65,6 +65,7 @@ defmodule RDF.Serialization.Writer do encoded_string = encoder.encode!(data, opts) File.write!(path, encoded_string, file_mode(encoder, opts)) end + defp do_write_file!(true, encoder, data, path, opts) do data |> encoder.stream(opts) @@ -73,15 +74,18 @@ defmodule RDF.Serialization.Writer do :ok end + @doc false + def file_mode(_encoder, opts) do + opts + |> Keyword.get(:file_mode, @default_file_mode) + |> List.wrap() + |> set_force(Keyword.get(opts, :force)) + |> set_gzip(Keyword.get(opts, :gzip)) end - defp file_mode(_encoder, opts) do - file_mode = Keyword.get(opts, :file_mode, @default_file_mode) + defp set_force(file_mode, true), do: List.delete(file_mode, :exclusive) + defp set_force(file_mode, _), do: file_mode - if Keyword.get(opts, :force) do - List.delete(file_mode, :exclusive) - else - file_mode - end - end + defp set_gzip(file_mode, true), do: [:compressed | file_mode] + defp set_gzip(file_mode, _), do: file_mode end diff --git a/test/unit/serialization/reader_test.exs b/test/unit/serialization/reader_test.exs new file mode 100644 index 0000000..3384a56 --- /dev/null +++ b/test/unit/serialization/reader_test.exs @@ -0,0 
+1,19 @@ +defmodule RDF.Serialization.ReaderTest do + use RDF.Test.Case + + doctest RDF.Serialization.Reader + + alias RDF.Serialization.Reader + alias RDF.Turtle + + describe "file_mode/2" do + test ":gzip without other :file_mode opts" do + assert Reader.file_mode(Turtle.Decoder, gzip: true) == ~w[compressed read utf8]a + end + + test ":gzip with other :file_mode opts" do + assert Reader.file_mode(Turtle.Decoder, gzip: true, file_mode: [:charlist]) == + ~w[compressed charlist]a + end + end +end diff --git a/test/unit/serialization/serialization_test.exs b/test/unit/serialization/serialization_test.exs index de7a394..aad265b 100644 --- a/test/unit/serialization/serialization_test.exs +++ b/test/unit/serialization/serialization_test.exs @@ -260,6 +260,79 @@ defmodule RDF.SerializationTest do end end + test ":gzip opt" do + # first ensure that :gzip is not ignored on both read and write which would lead to a false positive + file = file("gzip_test.gz") + Serialization.write_file!(@example_graph, file, format: :turtle, gzip: true, force: true) + assert_raise RuntimeError, fn -> Serialization.read_file!(file, format: :turtle) end + + Serialization.write_file!(@example_graph, file, + format: :ntriples, + gzip: true, + stream: true, + force: true + ) + + # Why do we get an UndefinedFunctionError (function :unicode.format_error/1 is undefined or private) + assert_raise UndefinedFunctionError, fn -> + Serialization.read_file!(file, format: :ntriples, stream: true) + end + + :ok = Serialization.write_file(@example_graph, file, format: :turtle, gzip: true, force: true) + assert {:error, _} = Serialization.read_file(file, format: :turtle) + + :ok = + Serialization.write_file(@example_graph, file, + format: :ntriples, + gzip: true, + stream: true, + force: true + ) + + assert {:error, _} = Serialization.read_file(file, format: :ntriples, stream: true) + + # start of the actual tests + assert :ok = + Serialization.write_file(@example_graph, file, + format: :turtle, + gzip: 
true, + force: true + ) + + assert Serialization.read_file(file, format: :turtle, gzip: true) == {:ok, @example_graph} + + assert :ok = + Serialization.write_file(@example_graph, file, + format: :ntriples, + gzip: true, + stream: true, + force: true + ) + + assert Serialization.read_file(file, format: :ntriples, stream: true, gzip: true) == + {:ok, Graph.clear_metadata(@example_graph)} + + assert :ok = + Serialization.write_file!(@example_graph, file, + format: :turtle, + gzip: true, + force: true + ) + + assert Serialization.read_file!(file, format: :turtle, gzip: true) == @example_graph + + assert :ok = + Serialization.write_file!(@example_graph, file, + format: :ntriples, + gzip: true, + stream: true, + force: true + ) + + assert Serialization.read_file!(file, format: :ntriples, stream: true, gzip: true) == + Graph.clear_metadata(@example_graph) + end + describe "use_file_streaming/2" do test "without opts" do refute Serialization.use_file_streaming(NTriples.Decoder, []) diff --git a/test/unit/serialization/writer_test.exs b/test/unit/serialization/writer_test.exs new file mode 100644 index 0000000..e68d869 --- /dev/null +++ b/test/unit/serialization/writer_test.exs @@ -0,0 +1,23 @@ +defmodule RDF.Serialization.WriterTest do + use RDF.Test.Case + + doctest RDF.Serialization.Writer + + alias RDF.Serialization.Writer + alias RDF.Turtle + + describe "file_mode/2" do + test ":force" do + assert Writer.file_mode(Turtle.Encoder, force: true) == ~w[write]a + end + + test ":gzip without other :file_mode opts" do + assert Writer.file_mode(Turtle.Encoder, gzip: true) == ~w[compressed write exclusive]a + end + + test ":gzip with other :file_mode opts" do + assert Writer.file_mode(Turtle.Encoder, gzip: true, file_mode: [:append]) == + ~w[compressed append]a + end + end +end