Add :gzip opt on all read and write file serialization functions

This commit is contained in:
Marcel Otto 2020-11-05 17:23:59 +01:00
parent d3f66bd5d9
commit 41a299e122
8 changed files with 180 additions and 18 deletions

View file

@ -11,6 +11,9 @@ This project adheres to [Semantic Versioning](http://semver.org/) and
- general serialization functions for reading from and writing to streams - general serialization functions for reading from and writing to streams
and implementations for N-Triples and N-Quads (Turtle still to come) and implementations for N-Triples and N-Quads (Turtle still to come)
- a `:gzip` option flag on all `read_file/3` and `write_file/3` functions
allows reading and writing all supported serialization formats from and to
gzipped files (works also with the new possibility to read and write files via streams)
- `RDF.Dataset.prefixes/1` for getting an aggregated `RDF.PrefixMap` over all graphs - `RDF.Dataset.prefixes/1` for getting an aggregated `RDF.PrefixMap` over all graphs
- `RDF.PrefixMap.put/3` for adding a prefix mapping and overwrite an existing one - `RDF.PrefixMap.put/3` for adding a prefix mapping and overwrite an existing one
- `RDF.BlankNode.value/1` for getting the internal string representation of a blank node - `RDF.BlankNode.value/1` for getting the internal string representation of a blank node

View file

@ -117,6 +117,14 @@ defmodule RDF.Serialization.Format do
It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or
dataset, or `{:error, reason}` if an error occurs. dataset, or `{:error, reason}` if an error occurs.
## Options
General serialization-independent options:
- `:gzip`: Allows to read directly from a gzipped file (default: `false`)
- `:file_mode`: A list with the Elixir `File.open` modes to be used for reading
(default: `[:read, :utf8]`)
#{@decoder_doc_ref} #{@decoder_doc_ref}
""" """
@spec read_file(Path.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any} @spec read_file(Path.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any}
@ -127,6 +135,8 @@ defmodule RDF.Serialization.Format do
As opposed to `read_file/2`, it raises an exception if an error occurs. As opposed to `read_file/2`, it raises an exception if an error occurs.
See `read_file/2` for the available format-independent options.
#{@decoder_doc_ref} #{@decoder_doc_ref}
""" """
@spec read_file!(Path.t(), keyword) :: Graph.t() | Dataset.t() @spec read_file!(Path.t(), keyword) :: Graph.t() | Dataset.t()
@ -178,9 +188,10 @@ defmodule RDF.Serialization.Format do
General serialization-independent options: General serialization-independent options:
- `:force` - If not set to `true`, an error is raised when the given file - `:gzip`: Allows to write directly to a gzipped file (default: `false`)
- `:force`: If not set to `true`, an error is raised when the given file
already exists (default: `false`) already exists (default: `false`)
- `:file_mode` - A list with the Elixir `File.open` modes to be used for writing - `:file_mode`: A list with the Elixir `File.open` modes to be used for writing
(default: `[:write, :exclusive]`) (default: `[:write, :exclusive]`)
#{@encoder_doc_ref} #{@encoder_doc_ref}

View file

@ -9,6 +9,8 @@ defmodule RDF.Serialization.Reader do
alias RDF.{Serialization, Dataset, Graph} alias RDF.{Serialization, Dataset, Graph}
@default_file_mode ~w[read utf8]a
@spec read_string(module, String.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any} @spec read_string(module, String.t(), keyword) :: {:ok, Graph.t() | Dataset.t()} | {:error, any}
def read_string(decoder, content, opts \\ []) do def read_string(decoder, content, opts \\ []) do
decoder.decode(content, opts) decoder.decode(content, opts)
@ -36,16 +38,19 @@ defmodule RDF.Serialization.Reader do
end end
defp do_read_file(false, decoder, file, opts) do defp do_read_file(false, decoder, file, opts) do
case File.read(file) do file
|> File.open(file_mode(decoder, opts), &IO.read(&1, :all))
|> case do
{:ok, {:error, error}} -> {:error, error}
{:ok, content} -> decoder.decode(content, opts) {:ok, content} -> decoder.decode(content, opts)
{:error, reason} -> {:error, reason} {:error, error} -> {:error, error}
end end
end end
defp do_read_file(true, decoder, file, opts) do defp do_read_file(true, decoder, file, opts) do
{:ok, {:ok,
file file
|> File.stream!() |> File.stream!(file_mode(decoder, opts))
|> decoder.decode_from_stream(opts)} |> decoder.decode_from_stream(opts)}
rescue rescue
error in RuntimeError -> {:error, error.message} error in RuntimeError -> {:error, error.message}
@ -61,13 +66,28 @@ defmodule RDF.Serialization.Reader do
defp do_read_file!(false, decoder, file, opts) do defp do_read_file!(false, decoder, file, opts) do
file file
|> File.read!() |> File.open!(file_mode(decoder, opts), &IO.read(&1, :all))
|> decoder.decode!(opts) |> case do
{:error, error} when is_tuple(error) -> error |> inspect() |> raise()
{:error, error} -> raise(error)
content -> decoder.decode!(content, opts)
end
end end
defp do_read_file!(true, decoder, file, opts) do defp do_read_file!(true, decoder, file, opts) do
file file
|> File.stream!() |> File.stream!(file_mode(decoder, opts))
|> decoder.decode_from_stream(opts) |> decoder.decode_from_stream(opts)
end end
@doc false
# Computes the list of file modes used to open a file for reading.
#
# The `:file_mode` option (falling back to the module's default modes) is
# wrapped into a list, and `:compressed` is prepended when `gzip: true` is given.
# The decoder argument is not used.
def file_mode(_decoder, opts) do
  modes = opts |> Keyword.get(:file_mode, @default_file_mode) |> List.wrap()
  gzip = Keyword.get(opts, :gzip)

  set_gzip(modes, gzip)
end
# Prepends `:compressed` to the given file modes only when the gzip flag is
# exactly `true`; any other flag value leaves the modes untouched.
defp set_gzip(file_mode, gzip) do
  case gzip do
    true -> [:compressed | file_mode]
    _ -> file_mode
  end
end
end end

View file

@ -187,10 +187,18 @@ defmodule RDF.Serialization do
It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or It returns an `{:ok, data}` tuple, with `data` being the deserialized graph or
dataset, or `{:error, reason}` if an error occurs. dataset, or `{:error, reason}` if an error occurs.
## Options
The format can be specified with the `format` option and a format name or the The format can be specified with the `format` option and a format name or the
`media_type` option and the media type of the format. If none of these are `media_type` option and the media type of the format. If none of these are
given, the format gets inferred from the extension of the given file name. given, the format gets inferred from the extension of the given file name.
Other available serialization-independent options:
- `:gzip`: Allows to read directly from a gzipped file (default: `false`)
- `:file_mode`: A list with the Elixir `File.open` modes to be used for reading
(default: `[:read, :utf8]`)
Please refer to the documentation of the decoder of a RDF serialization format Please refer to the documentation of the decoder of a RDF serialization format
for format-specific options. for format-specific options.
""" """
@ -292,9 +300,10 @@ defmodule RDF.Serialization do
Other available serialization-independent options: Other available serialization-independent options:
- `:force` - If not set to `true`, an error is raised when the given file - `:gzip`: Allows to write directly to a gzipped file (default: `false`)
- `:force`: If not set to `true`, an error is raised when the given file
already exists (default: `false`) already exists (default: `false`)
- `:file_mode` - A list with the Elixir `File.open` modes to be used for writing - `:file_mode`: A list with the Elixir `File.open` modes to be used for writing
(default: `[:write, :exclusive]`) (default: `[:write, :exclusive]`)
Please refer to the documentation of the encoder of a RDF serialization format Please refer to the documentation of the encoder of a RDF serialization format

View file

@ -65,6 +65,7 @@ defmodule RDF.Serialization.Writer do
encoded_string = encoder.encode!(data, opts) encoded_string = encoder.encode!(data, opts)
File.write!(path, encoded_string, file_mode(encoder, opts)) File.write!(path, encoded_string, file_mode(encoder, opts))
end end
defp do_write_file!(true, encoder, data, path, opts) do defp do_write_file!(true, encoder, data, path, opts) do
data data
|> encoder.stream(opts) |> encoder.stream(opts)
@ -73,15 +74,18 @@ defmodule RDF.Serialization.Writer do
:ok :ok
end end
@doc false
def file_mode(_encoder, opts) do
opts
|> Keyword.get(:file_mode, @default_file_mode)
|> List.wrap()
|> set_force(Keyword.get(opts, :force))
|> set_gzip(Keyword.get(opts, :gzip))
end end
defp file_mode(_encoder, opts) do defp set_force(file_mode, true), do: List.delete(file_mode, :exclusive)
file_mode = Keyword.get(opts, :file_mode, @default_file_mode) defp set_force(file_mode, _), do: file_mode
if Keyword.get(opts, :force) do defp set_gzip(file_mode, true), do: [:compressed | file_mode]
List.delete(file_mode, :exclusive) defp set_gzip(file_mode, _), do: file_mode
else
file_mode
end
end
end end

View file

@ -0,0 +1,19 @@
defmodule RDF.Serialization.ReaderTest do
  use RDF.Test.Case

  doctest RDF.Serialization.Reader

  alias RDF.Serialization.Reader
  alias RDF.Turtle

  describe "file_mode/2" do
    # Without an explicit :file_mode, :compressed is prepended to the default read modes.
    test ":gzip without other :file_mode opts" do
      modes = Reader.file_mode(Turtle.Decoder, gzip: true)

      assert modes == [:compressed, :read, :utf8]
    end

    # An explicit :file_mode replaces the defaults; :compressed is still prepended.
    test ":gzip with other :file_mode opts" do
      modes = Reader.file_mode(Turtle.Decoder, gzip: true, file_mode: [:charlist])

      assert modes == [:compressed, :charlist]
    end
  end
end

View file

@ -260,6 +260,79 @@ defmodule RDF.SerializationTest do
end end
end end
test ":gzip opt" do
# Guard against false positives: if :gzip were silently ignored on both write
# and read, the round-trip assertions further down would still pass. So first
# verify that a file written with `gzip: true` can NOT be read without `gzip: true`.
file = file("gzip_test.gz")
# Raising variants, non-streamed (Turtle)
Serialization.write_file!(@example_graph, file, format: :turtle, gzip: true, force: true)
assert_raise RuntimeError, fn -> Serialization.read_file!(file, format: :turtle) end
# Raising variants, streamed (N-Triples)
Serialization.write_file!(@example_graph, file,
format: :ntriples,
gzip: true,
stream: true,
force: true
)
# TODO: Why do we get an UndefinedFunctionError
# (function :unicode.format_error/1 is undefined or private) here?
assert_raise UndefinedFunctionError, fn ->
Serialization.read_file!(file, format: :ntriples, stream: true)
end
# Tuple-returning variants, non-streamed (Turtle)
:ok = Serialization.write_file(@example_graph, file, format: :turtle, gzip: true, force: true)
assert {:error, _} = Serialization.read_file(file, format: :turtle)
# Tuple-returning variants, streamed (N-Triples)
:ok =
Serialization.write_file(@example_graph, file,
format: :ntriples,
gzip: true,
stream: true,
force: true
)
assert {:error, _} = Serialization.read_file(file, format: :ntriples, stream: true)
# Start of the actual tests: write with `gzip: true` and read back with `gzip: true`.
# write_file/read_file, non-streamed (Turtle)
assert :ok =
Serialization.write_file(@example_graph, file,
format: :turtle,
gzip: true,
force: true
)
assert Serialization.read_file(file, format: :turtle, gzip: true) == {:ok, @example_graph}
# write_file/read_file, streamed (N-Triples); the result is compared against
# the example graph with its metadata cleared
assert :ok =
Serialization.write_file(@example_graph, file,
format: :ntriples,
gzip: true,
stream: true,
force: true
)
assert Serialization.read_file(file, format: :ntriples, stream: true, gzip: true) ==
{:ok, Graph.clear_metadata(@example_graph)}
# write_file!/read_file!, non-streamed (Turtle)
assert :ok =
Serialization.write_file!(@example_graph, file,
format: :turtle,
gzip: true,
force: true
)
assert Serialization.read_file!(file, format: :turtle, gzip: true) == @example_graph
# write_file!/read_file!, streamed (N-Triples); again compared against the
# example graph with its metadata cleared
assert :ok =
Serialization.write_file!(@example_graph, file,
format: :ntriples,
gzip: true,
stream: true,
force: true
)
assert Serialization.read_file!(file, format: :ntriples, stream: true, gzip: true) ==
Graph.clear_metadata(@example_graph)
end
describe "use_file_streaming/2" do describe "use_file_streaming/2" do
test "without opts" do test "without opts" do
refute Serialization.use_file_streaming(NTriples.Decoder, []) refute Serialization.use_file_streaming(NTriples.Decoder, [])

View file

@ -0,0 +1,23 @@
defmodule RDF.Serialization.WriterTest do
  use RDF.Test.Case

  doctest RDF.Serialization.Writer

  alias RDF.Serialization.Writer
  alias RDF.Turtle

  describe "file_mode/2" do
    # :force drops :exclusive from the default write modes.
    test ":force" do
      modes = Writer.file_mode(Turtle.Encoder, force: true)

      assert modes == [:write]
    end

    # Without an explicit :file_mode, :compressed is prepended to the default write modes.
    test ":gzip without other :file_mode opts" do
      modes = Writer.file_mode(Turtle.Encoder, gzip: true)

      assert modes == [:compressed, :write, :exclusive]
    end

    # An explicit :file_mode replaces the defaults; :compressed is still prepended.
    test ":gzip with other :file_mode opts" do
      modes = Writer.file_mode(Turtle.Encoder, gzip: true, file_mode: [:append])

      assert modes == [:compressed, :append]
    end
  end
end