defmodule RDF.Vocabulary.Namespace do
  @moduledoc """
  An RDF vocabulary as a `RDF.Namespace`.

  `RDF.Vocabulary.Namespace` modules represent an RDF vocabulary as a `RDF.Namespace`.
  They can be defined with the `defvocab/2` macro of this module.

  RDF.ex comes with predefined modules for some fundamental vocabularies in
  the `RDF.NS` module.
  Furthermore, the [rdf_vocab](https://hex.pm/packages/rdf_vocab) package
  contains predefined modules for popular vocabularies.
  """
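
  # A minimal usage sketch (illustrative only; the module name, vocabulary name,
  # and URI below are hypothetical, not part of this library). `defvocab/2`
  # accepts the options handled in this file: :base_uri, :file, :data, :terms,
  # :alias, :ignore, :strict, :invalid_characters and :case_violations.
  #
  #     defmodule My.Vocabularies do
  #       use RDF.Vocabulary.Namespace
  #
  #       defvocab Example,
  #         base_uri: "http://example.com/ns/",
  #         terms:    ~w[Person name]
  #     end
  #
  #     My.Vocabularies.Example.name
  #     #=> the URI <http://example.com/ns/name> (see `define_vocab_terms/2` below)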

  alias RDF.Utils.ResourceClassifier

  @vocabs_dir "priv/vocabs"
  @big_vocab_threshold 300

  defmacro __using__(_opts) do
    quote do
      import unquote(__MODULE__)
    end
  end

  @doc """
  Defines a `RDF.Namespace` module for an RDF vocabulary.
  """
  defmacro defvocab(name, opts) do
    strict = strict?(opts)
    base_uri = base_uri!(opts)
    file = filename!(opts)

    {terms, data} =
      case source!(opts) do
        {:terms, terms} -> {terms, nil}
        {:data, data} -> {rdf_data_vocab_terms(data, base_uri), data}
      end

    if data && RDF.Data.subject_count(data) > @big_vocab_threshold do
      IO.puts("Compiling vocabulary namespace for #{base_uri} may take some time")
    end

    ignored_terms = ignored_terms!(opts)

    terms =
      terms
      |> term_mapping!(opts)
      |> Map.drop(MapSet.to_list(ignored_terms))
      |> validate_terms!
      |> validate_characters!(opts)
      |> validate_case!(data, base_uri, opts)

    case_separated_terms = group_terms_by_case(terms)
    lowercased_terms = Map.get(case_separated_terms, :lowercased, %{})

    quote do
      vocabdoc = Module.delete_attribute(__MODULE__, :vocabdoc)

      defmodule unquote(name) do
        @moduledoc vocabdoc

        @behaviour RDF.Namespace

        if unquote(file) do
          @external_resource unquote(file)
        end

        @base_uri unquote(base_uri)
        def __base_uri__, do: @base_uri

        @strict unquote(strict)
        def __strict__, do: @strict

        @terms unquote(Macro.escape(terms))
        def __terms__, do: @terms |> Map.keys

        @ignored_terms unquote(Macro.escape(ignored_terms))

        @doc """
        Returns all known URIs of the vocabulary.
        """
        def __uris__ do
          @terms
          |> Enum.map(fn
            {term, true} -> term_to_uri(@base_uri, term)
            {_alias, term} -> term_to_uri(@base_uri, term)
          end)
          |> Enum.uniq
        end

        define_vocab_terms unquote(lowercased_terms), unquote(base_uri)

        def __resolve_term__(term) do
          case @terms[term] do
            nil ->
              # TODO: Why does this MapSet.member? call produce a warning? It does NOT always yield the same result!
              if @strict or MapSet.member?(@ignored_terms, term) do
                raise RDF.Namespace.UndefinedTermError,
                      "undefined term #{term} in strict vocabulary #{__MODULE__}"
              else
                term_to_uri(@base_uri, term)
              end

            true ->
              term_to_uri(@base_uri, term)

            original_term ->
              term_to_uri(@base_uri, original_term)
          end
        end

        if not @strict do
          def unquote(:"$handle_undefined_function")(term, []) do
            if MapSet.member?(@ignored_terms, term) do
              raise UndefinedFunctionError
            else
              term_to_uri(@base_uri, term)
            end
          end

          def unquote(:"$handle_undefined_function")(term, [subject | objects]) do
            if MapSet.member?(@ignored_terms, term) do
              raise UndefinedFunctionError
            else
              RDF.Description.new(subject, term_to_uri(@base_uri, term), objects)
            end
          end
        end
      end
    end
  end
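
  # For reference, a vocabulary namespace module generated by `defvocab/2` exposes
  # (sketch, using a hypothetical vocabulary module `EX` with base URI
  # "http://example.com/ns/"):
  #
  #     EX.__base_uri__   #=> "http://example.com/ns/"
  #     EX.__strict__     #=> true (unless strict: false was given)
  #     EX.__terms__      #=> the list of term (and alias) atoms
  #     EX.__uris__       #=> all known term URIs
  #     EX.__resolve_term__(:name)
  #     #=> the URI of the term; raises RDF.Namespace.UndefinedTermError for
  #     #   unknown terms of a strict vocabulary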

  @doc false
  defmacro define_vocab_terms(terms, base_uri) do
    terms
    |> Stream.filter(fn
      {term, true} -> valid_term?(term)
      {_, _} -> true
    end)
    |> Stream.map(fn
      {term, true} -> {term, term}
      {term, original_term} -> {term, original_term}
    end)
    |> Enum.map(fn {term, uri_suffix} ->
      uri = term_to_uri(base_uri, uri_suffix)

      quote do
        @doc "<#{unquote(to_string(uri))}>"
        def unquote(term)(), do: unquote(Macro.escape(uri))

        @doc "`RDF.Description` builder for `#{unquote(term)}/0`"
        def unquote(term)(subject, object) do
          RDF.Description.new(subject, unquote(Macro.escape(uri)), object)
        end

        # Is there a better way to support multiple objects via arguments?
        @doc false
        def unquote(term)(subject, o1, o2),
          do: unquote(term)(subject, [o1, o2])

        @doc false
        def unquote(term)(subject, o1, o2, o3),
          do: unquote(term)(subject, [o1, o2, o3])

        @doc false
        def unquote(term)(subject, o1, o2, o3, o4),
          do: unquote(term)(subject, [o1, o2, o3, o4])

        @doc false
        def unquote(term)(subject, o1, o2, o3, o4, o5),
          do: unquote(term)(subject, [o1, o2, o3, o4, o5])
      end
    end)
  end
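
  # Sketch of what `define_vocab_terms/2` generates for each lowercased term,
  # shown for a hypothetical term :name with base URI "http://example.com/ns/":
  #
  #     EX.name()                #=> URI.parse("http://example.com/ns/name")
  #     EX.name(subject, object) #=> RDF.Description.new(subject, EX.name, object)
  #     EX.name(subject, o1, o2) #=> EX.name(subject, [o1, o2])  (up to five objects)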

  defp strict?(opts),
    do: Keyword.get(opts, :strict, true)

  defp base_uri!(opts) do
    base_uri = Keyword.fetch!(opts, :base_uri)

    unless is_binary(base_uri) and String.ends_with?(base_uri, ["/", "#"]) do
      raise RDF.Namespace.InvalidVocabBaseURIError,
            "a base_uri without a trailing '/' or '#' is invalid"
    else
      base_uri
    end
  end
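
  # For example (hypothetical URIs): "http://example.com/ns/" and
  # "http://example.com/ns#" are accepted as a :base_uri, while
  # "http://example.com/ns" raises RDF.Namespace.InvalidVocabBaseURIError.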

  defp source!(opts) do
    cond do
      Keyword.has_key?(opts, :file) -> {:data, filename!(opts) |> load_file}
      rdf_data = Keyword.get(opts, :data) -> {:data, raw_rdf_data(rdf_data)}
      terms = Keyword.get(opts, :terms) -> {:terms, terms_from_user_input!(terms)}
      true ->
        raise KeyError, key: ~w[terms data file], term: opts
    end
  end

  defp terms_from_user_input!(terms) do
    # TODO: find an alternative to Code.eval_quoted - We want to support that the terms can be given as sigils ...
    {terms, _} = Code.eval_quoted(terms, [], rdf_data_env())

    Enum.map terms, fn
      term when is_atom(term) -> term
      term when is_binary(term) -> String.to_atom(term)
      term ->
        raise RDF.Namespace.InvalidTermError,
              "'#{term}' is not a valid vocabulary term"
    end
  end

  defp raw_rdf_data(%RDF.Description{} = rdf_data), do: rdf_data
  defp raw_rdf_data(%RDF.Graph{} = rdf_data), do: rdf_data
  defp raw_rdf_data(%RDF.Dataset{} = rdf_data), do: rdf_data

  defp raw_rdf_data(rdf_data) do
    # TODO: find an alternative to Code.eval_quoted
    {rdf_data, _} = Code.eval_quoted(rdf_data, [], rdf_data_env())
    rdf_data
  end

  defp ignored_terms!(opts) do
    # TODO: find an alternative to Code.eval_quoted - We want to support that the terms can be given as sigils ...
    with terms = Keyword.get(opts, :ignore, []) do
      {terms, _} = Code.eval_quoted(terms, [], rdf_data_env())

      terms
      |> Enum.map(fn
        term when is_atom(term) -> term
        term when is_binary(term) -> String.to_atom(term)
        term -> raise RDF.Namespace.InvalidTermError, inspect(term)
      end)
      |> MapSet.new
    end
  end

  defp term_mapping!(terms, opts) do
    terms =
      Map.new terms, fn
        term when is_atom(term) -> {term, true}
        term -> {String.to_atom(term), true}
      end

    Keyword.get(opts, :alias, [])
    |> Enum.reduce(terms, fn {alias, original_term}, terms ->
      term = String.to_atom(original_term)

      cond do
        not valid_characters?(alias) ->
          raise RDF.Namespace.InvalidAliasError,
                "alias '#{alias}' contains invalid characters"

        Map.get(terms, alias) == true ->
          raise RDF.Namespace.InvalidAliasError,
                "alias '#{alias}' already defined"

        strict?(opts) and not Map.has_key?(terms, term) ->
          raise RDF.Namespace.InvalidAliasError,
                "term '#{original_term}' is not a term in this vocabulary"

        Map.get(terms, term, true) != true ->
          raise RDF.Namespace.InvalidAliasError,
                "'#{original_term}' is already an alias"

        true ->
          Map.put(terms, alias, to_string(original_term))
      end
    end)
  end
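
  # The mapping produced by `term_mapping!/2` maps every term atom to `true` and
  # every alias atom to the (string) name of its original term, e.g. with
  # hypothetical terms and an alias `full_name: "name"`:
  #
  #     %{Person: true, name: true, full_name: "name"}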

  defp aliased_terms(terms) do
    terms
    |> Map.values
    |> MapSet.new
    |> MapSet.delete(true)
    |> Enum.map(&String.to_atom/1)
  end

  @invalid_terms MapSet.new ~w[
    and
    or
    xor
    in
    fn
    def
    when
    if
    for
    case
    with
    quote
    unquote
    unquote_splicing
    alias
    import
    require
    super
    __aliases__
  ]a

  def invalid_terms, do: @invalid_terms

  defp validate_terms!(terms) do
    with aliased_terms = aliased_terms(terms) do
      for {term, _} <- terms, term not in aliased_terms and not valid_term?(term) do
        term
      end
      |> handle_invalid_terms!
    end

    terms
  end

  defp valid_term?(term) do
    not MapSet.member?(@invalid_terms, term)
  end

  defp handle_invalid_terms!([]), do: nil

  defp handle_invalid_terms!(invalid_terms) do
    raise RDF.Namespace.InvalidTermError, """
    The following terms cannot be used because they conflict with Elixir semantics:

    - #{Enum.join(invalid_terms, "\n- ")}

    You have the following options:

    - define an alias with the :alias option on defvocab
    - ignore the resource with the :ignore option on defvocab
    """
  end

  defp validate_characters!(terms, opts) do
    if (handling = Keyword.get(opts, :invalid_characters, :fail)) == :ignore do
      terms
    else
      terms
      |> detect_invalid_characters
      |> handle_invalid_characters(handling, terms)
    end
  end

  defp detect_invalid_characters(terms) do
    with aliased_terms = aliased_terms(terms) do
      for {term, _} <- terms, term not in aliased_terms and not valid_characters?(term),
        do: term
    end
  end

  defp handle_invalid_characters([], _, terms), do: terms

  defp handle_invalid_characters(invalid_terms, :fail, _) do
    raise RDF.Namespace.InvalidTermError, """
    The following terms contain invalid characters:

    - #{Enum.join(invalid_terms, "\n- ")}

    You have the following options:

    - if you are in control of the vocabulary, consider renaming the resource
    - define an alias with the :alias option on defvocab
    - change the handling of invalid characters with the :invalid_characters option on defvocab
    - ignore the resource with the :ignore option on defvocab
    """
  end

  defp handle_invalid_characters(invalid_terms, :warn, terms) do
    Enum.each invalid_terms, fn term ->
      IO.warn "'#{term}' is not a valid term, since it contains invalid characters"
    end

    terms
  end

  defp valid_characters?(term) when is_atom(term),
    do: valid_characters?(Atom.to_string(term))

  defp valid_characters?(term),
    do: Regex.match?(~r/^[a-zA-Z_]\w*$/, term)
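
  # For example, terms must match ~r/^[a-zA-Z_]\w*$/:
  #
  #     valid_characters?(:name)      #=> true
  #     valid_characters?(:_internal) #=> true
  #     valid_characters?("foo-bar")  #=> false (hyphens are not allowed)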

  defp validate_case!(terms, nil, _, _), do: terms

  defp validate_case!(terms, data, base_uri, opts) do
    if (handling = Keyword.get(opts, :case_violations, :warn)) == :ignore do
      terms
    else
      terms
      |> detect_case_violations(data, base_uri)
      |> group_case_violations
      |> handle_case_violations(handling, terms, base_uri, opts)
    end
  end

  defp detect_case_violations(terms, data, base_uri) do
    aliased_terms = aliased_terms(terms)

    terms
    |> Enum.filter(fn {term, _} ->
      not (Atom.to_string(term) |> String.starts_with?("_"))
    end)
    |> Enum.filter(fn
      {term, true} ->
        if term not in aliased_terms do
          proper_case?(term, base_uri, Atom.to_string(term), data)
        end

      {term, original_term} ->
        proper_case?(term, base_uri, original_term, data)
    end)
  end

  defp proper_case?(term, base_uri, uri_suffix, data) do
    case ResourceClassifier.property?(term_to_uri(base_uri, uri_suffix), data) do
      true -> not lowercase?(term)
      false -> lowercase?(term)
      nil -> lowercase?(term)
    end
  end

  defp group_case_violations(violations) do
    violations
    |> Enum.group_by(fn
      {term, true} ->
        if lowercase?(term),
          do: :lowercased_term,
          else: :capitalized_term

      {term, _original} ->
        if lowercase?(term),
          do: :lowercased_alias,
          else: :capitalized_alias
    end)
  end

  defp handle_case_violations(%{} = violations, _, terms, _, _) when map_size(violations) == 0,
    do: terms

  defp handle_case_violations(violations, :fail, _, base_uri, _) do
    resource_name_violations = fn violations ->
      violations
      |> Enum.map(fn {term, true} -> term_to_uri(base_uri, term) end)
      |> Enum.map(&to_string/1)
      |> Enum.join("\n- ")
    end

    alias_violations = fn violations ->
      violations
      |> Enum.map(fn {term, original} ->
        "alias #{term} for #{term_to_uri(base_uri, original)}"
      end)
      |> Enum.join("\n- ")
    end

    violation_error_lines =
      violations
      |> Enum.map(fn
        {:capitalized_term, violations} ->
          """
          Terms for properties should be lowercased, but the following properties are
          capitalized:

          - #{resource_name_violations.(violations)}

          """

        {:lowercased_term, violations} ->
          """
          Terms for non-property resources should be capitalized, but the following
          non-properties are lowercased:

          - #{resource_name_violations.(violations)}

          """

        {:capitalized_alias, violations} ->
          """
          Terms for properties should be lowercased, but the following aliases for
          properties are capitalized:

          - #{alias_violations.(violations)}

          """

        {:lowercased_alias, violations} ->
          """
          Terms for non-property resources should be capitalized, but the following
          aliases for non-properties are lowercased:

          - #{alias_violations.(violations)}

          """
      end)
      |> Enum.join

    raise RDF.Namespace.InvalidTermError, """
    Case violations detected

    #{violation_error_lines}
    You have the following options:

    - if you are in control of the vocabulary, consider renaming the resource
    - define a properly cased alias with the :alias option on defvocab
    - change the handling of case violations with the :case_violations option on defvocab
    - ignore the resource with the :ignore option on defvocab
    """
  end

  defp handle_case_violations(violations, :warn, terms, base_uri, _) do
    for {type, violations} <- violations,
        {term, original} <- violations do
      case_violation_warning(type, term, original, base_uri)
    end

    terms
  end

  defp case_violation_warning(:capitalized_term, term, _, base_uri) do
    IO.warn "'#{term_to_uri(base_uri, term)}' is a capitalized property"
  end

  defp case_violation_warning(:lowercased_term, term, _, base_uri) do
    IO.warn "'#{term_to_uri(base_uri, term)}' is a lowercased non-property resource"
  end

  defp case_violation_warning(:capitalized_alias, term, _, _) do
    IO.warn "capitalized alias '#{term}' for a property"
  end

  defp case_violation_warning(:lowercased_alias, term, _, _) do
    IO.warn "lowercased alias '#{term}' for a non-property resource"
  end

  defp filename!(opts) do
    if filename = Keyword.get(opts, :file) do
      cond do
        File.exists?(filename) ->
          filename

        File.exists?(expanded_filename = Path.expand(filename, @vocabs_dir)) ->
          expanded_filename

        true ->
          raise File.Error, path: filename, action: "find", reason: :enoent
      end
    end
  end

  defp load_file(file) do
    # TODO: support other formats
    cond do
      String.ends_with?(file, ".nt") -> RDF.NTriples.read_file!(file)
      String.ends_with?(file, ".nq") -> RDF.NQuads.read_file!(file)
      String.ends_with?(file, ".ttl") -> RDF.Turtle.read_file!(file)
      true ->
        raise ArgumentError,
              "unsupported file type for #{file}: vocabulary namespaces can currently be created from N-Triples, N-Quads and Turtle files"
    end
  end

  defp rdf_data_env do
    import RDF.Sigils
    __ENV__
  end

  defp rdf_data_vocab_terms(data, base_uri) do
    data
    |> RDF.Data.resources
    # filter URIs
    |> Stream.filter(fn
      %URI{} -> true
      _ -> false
    end)
    |> Stream.map(&URI.to_string/1)
    |> Stream.map(&strip_base_uri(&1, base_uri))
    |> Stream.filter(&vocab_term?/1)
    |> Enum.map(&String.to_atom/1)
  end

  defp group_terms_by_case(terms) do
    terms
    |> Enum.group_by(fn {term, _} ->
      if lowercase?(term),
        do: :lowercased,
        else: :capitalized
    end)
    |> Map.new(fn {group, term_mapping} ->
      {group, Map.new(term_mapping)}
    end)
  end

  defp lowercase?(term) when is_atom(term),
    do: Atom.to_string(term) |> lowercase?

  defp lowercase?(term),
    do: term =~ ~r/^(_|\p{Ll})/u
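
  # `lowercase?/1` only looks at the first character; a leading underscore counts
  # as lowercased (such terms are skipped in detect_case_violations/3 anyway):
  #
  #     lowercase?(:name)   #=> true
  #     lowercase?(:Person) #=> false
  #     lowercase?(:_foo)   #=> true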

  defp strip_base_uri(uri, base_uri) do
    if String.starts_with?(uri, base_uri) do
      String.replace_prefix(uri, base_uri, "")
    end
  end

  defp vocab_term?(""), do: false

  defp vocab_term?(term) when is_binary(term) do
    not String.contains?(term, "/")
  end

  defp vocab_term?(_), do: false

  @doc false
  def term_to_uri(base_uri, term) when is_atom(term),
    do: term_to_uri(base_uri, Atom.to_string(term))

  def term_to_uri(base_uri, term),
    do: URI.parse(base_uri <> term)
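
  # For example (hypothetical base URI):
  #
  #     term_to_uri("http://example.com/ns/", :name)
  #     #=> URI.parse("http://example.com/ns/name")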

end