diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1637de9..473b5f1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,6 +24,9 @@ jobs: - name: Check mix format run: mix format --check-formatted + - name: Start Typesense + run: docker compose up -d typesense + - name: Run tests run: | - mix test + mix test --include typesense diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..4b83171 --- /dev/null +++ b/compose.yml @@ -0,0 +1,6 @@ +services: + typesense: + image: typesense/typesense:27.1 + command: --data-dir /tmp --api-key=hexdocs + ports: + - 8108:8108 diff --git a/config/config.exs b/config/config.exs index 12ea307..92d3e70 100644 --- a/config/config.exs +++ b/config/config.exs @@ -4,9 +4,13 @@ config :hexdocs, port: "4002", hexpm_url: "http://localhost:4000", hexpm_secret: "2cd6d09334d4b00a2be4d532342b799b", + typesense_url: "http://localhost:8108", + typesense_api_key: "hexdocs", + typesense_collection: "hexdocs", hexpm_impl: Hexdocs.Hexpm.Impl, store_impl: Hexdocs.Store.Local, cdn_impl: Hexdocs.CDN.Local, + search_impl: Hexdocs.Search.Local, source_repo_impl: Hexdocs.SourceRepo.GitHub, tmp_dir: "tmp", queue_id: "test", diff --git a/config/dev.exs b/config/dev.exs index bedb75e..f804867 100644 --- a/config/dev.exs +++ b/config/dev.exs @@ -5,4 +5,5 @@ config :hexdocs, hexpm_url: "http://localhost:4000", hexpm_impl: Hexdocs.Hexpm.Impl, store_impl: Hexdocs.Store.Local, - cdn_impl: Hexdocs.CDN.Local + cdn_impl: Hexdocs.CDN.Local, + search_impl: Hexdocs.Search.Local diff --git a/config/prod.exs b/config/prod.exs index 6a10cd6..6f5a1dc 100644 --- a/config/prod.exs +++ b/config/prod.exs @@ -4,6 +4,7 @@ config :hexdocs, hexpm_impl: Hexdocs.Hexpm.Impl, store_impl: Hexdocs.Store.Impl, cdn_impl: Hexdocs.CDN.Fastly, + search_impl: Hexdocs.Search.Typesense, queue_producer: BroadwaySQS.Producer, gcs_put_debounce: 3000 diff --git a/config/runtime.exs b/config/runtime.exs index bab3fe5..04dd2ea 
100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -5,6 +5,9 @@ if config_env() == :prod do port: System.fetch_env!("HEXDOCS_PORT"), hexpm_url: System.fetch_env!("HEXDOCS_HEXPM_URL"), hexpm_secret: System.fetch_env!("HEXDOCS_HEXPM_SECRET"), + typesense_url: System.fetch_env!("HEXDOCS_TYPESENSE_URL"), + typesense_api_key: System.fetch_env!("HEXDOCS_TYPESENSE_API_KEY"), + typesense_collection: System.fetch_env!("HEXDOCS_TYPESENSE_COLLECTION"), fastly_key: System.fetch_env!("HEXDOCS_FASTLY_KEY"), fastly_hexdocs: System.fetch_env!("HEXDOCS_FASTLY_HEXDOCS"), queue_id: System.fetch_env!("HEXDOCS_QUEUE_ID"), diff --git a/config/test.exs b/config/test.exs index 199131d..df442fc 100644 --- a/config/test.exs +++ b/config/test.exs @@ -6,6 +6,7 @@ config :hexdocs, hexpm_impl: Hexdocs.HexpmMock, store_impl: Hexdocs.Store.Local, cdn_impl: Hexdocs.CDN.Local, + search_impl: Hexdocs.Search.Local, source_repo_impl: Hexdocs.SourceRepo.Mock config :logger, level: :warning diff --git a/lib/hexdocs/http.ex b/lib/hexdocs/http.ex index 21e3ade..db24a38 100644 --- a/lib/hexdocs/http.ex +++ b/lib/hexdocs/http.ex @@ -23,6 +23,10 @@ defmodule Hexdocs.HTTP do |> read_response() end + def post(url, headers, body, opts \\ []) do + :hackney.post(url, headers, body, opts) + end + def delete(url, headers) do :hackney.delete(url, headers) |> read_response() diff --git a/lib/hexdocs/queue.ex b/lib/hexdocs/queue.ex index fd85fbb..fea9d8f 100644 --- a/lib/hexdocs/queue.ex +++ b/lib/hexdocs/queue.ex @@ -125,8 +125,12 @@ defmodule Hexdocs.Queue do update_package_sitemap(repository, key, package, files) end + if repository == "hexpm" do + update_search_index(key, package, version, files) + end + elapsed = System.os_time(:millisecond) - start - Logger.info("FINISHED UPLOADING DOCS #{key} #{elapsed}ms") + Logger.info("FINISHED UPLOADING AND INDEXING DOCS #{key} #{elapsed}ms") {:error, reason} -> Logger.error("Failed unpack #{repository}/#{package} #{version}: #{reason}") @@ -149,6 +153,10 @@ 
defmodule Hexdocs.Queue do Hexdocs.Bucket.delete(repository, package, version, all_versions) update_index_sitemap(repository, key) + if repository == "hexpm" do + Hexdocs.Search.delete(package, version) + end + elapsed = System.os_time(:millisecond) - start Logger.info("FINISHED DELETING DOCS #{key} #{elapsed}ms") :ok @@ -228,6 +236,14 @@ defmodule Hexdocs.Queue do :ok end + defp update_search_index(key, package, version, files) do + with {proglang, items} <- Hexdocs.Search.find_search_items(package, version, files) do + Logger.info("UPDATING SEARCH INDEX #{key}") + Hexdocs.Search.index(package, version, proglang, items) + Logger.info("UPDATED SEARCH INDEX #{key}") + end + end + @doc false def paths_for_sitemaps() do key_regex = ~r"docs/(.*)-(.*).tar.gz$" diff --git a/lib/hexdocs/search/local.ex b/lib/hexdocs/search/local.ex new file mode 100644 index 0000000..2085c2e --- /dev/null +++ b/lib/hexdocs/search/local.ex @@ -0,0 +1,9 @@ +defmodule Hexdocs.Search.Local do + @behaviour Hexdocs.Search + + @impl true + def index(_package, _version, _proglang, _items), do: :ok + + @impl true + def delete(_package, _version), do: :ok +end diff --git a/lib/hexdocs/search/search.ex b/lib/hexdocs/search/search.ex new file mode 100644 index 0000000..b2e859f --- /dev/null +++ b/lib/hexdocs/search/search.ex @@ -0,0 +1,93 @@ +defmodule Hexdocs.Search do + require Logger + + @type package :: String.t() + @type version :: Version.t() + @type proglang :: String.t() + @type search_items :: [map] + + @callback index(package, version, proglang, search_items) :: :ok + @callback delete(package, version) :: :ok + + defp impl, do: Application.fetch_env!(:hexdocs, :search_impl) + + @spec index(package, version, proglang, search_items) :: :ok + def index(package, version, proglang, search_items) do + impl().index(package, version, proglang, search_items) + end + + @spec delete(package, version) :: :ok + def delete(package, version) do + impl().delete(package, version) + end + + @spec 
find_search_items(package, version, [{Path.t(), content :: iodata}]) :: + {proglang, search_items} | nil + def find_search_items(package, version, files) do + search_data_js = + Enum.find_value(files, fn {path, content} -> + case Path.basename(path) do + "search_data-" <> _digest -> content + _other -> nil + end + end) + + unless search_data_js do + Logger.info("Failed to find search data for #{package} #{version}") + end + + search_data_json = + case search_data_js do + "searchData=" <> json -> + json + + _ when is_binary(search_data_js) -> + Logger.error("Unexpected search_data format for #{package} #{version}") + nil + + nil -> + nil + end + + search_data = + if search_data_json do + try do + :json.decode(search_data_json) + catch + _kind, reason -> + Logger.error( + "Failed to decode search data json for #{package} #{version}: " <> + inspect(reason) + ) + + nil + end + end + + case search_data do + %{"items" => [_ | _] = search_items} -> + proglang = Map.get(search_data, "proglang") || proglang(search_items) + {proglang, search_items} + + nil -> + nil + + _ -> + Logger.error( + "Failed to extract search items and proglang from search data for #{package} #{version}" + ) + + nil + end + end + + defp proglang(search_items) do + if Enum.any?(search_items, &elixir_module?/1), do: "elixir", else: "erlang" + end + + defp elixir_module?(%{"type" => "module", "title" => <<first_letter, _::binary>>}) + when first_letter in ?A..?Z, + do: true + + defp elixir_module?(_), do: false +end diff --git a/lib/hexdocs/search/typesense.ex b/lib/hexdocs/search/typesense.ex new file mode 100644 index 0000000..8c3b43e --- /dev/null +++ b/lib/hexdocs/search/typesense.ex @@ -0,0 +1,105 @@ +defmodule Hexdocs.Search.Typesense do + @moduledoc false + require Logger + alias Hexdocs.HTTP + + @behaviour Hexdocs.Search + + @impl true + def index(package, version, proglang, search_items) do + full_package = full_package(package, version) + + ndjson = + Enum.map(search_items, fn item -> + json = + Map.take(item,
["type", "ref", "title", "doc"]) + |> Map.put("package", full_package) + |> Map.put("proglang", proglang) + |> :json.encode() + + [json, ?\n] + end) + + url = url("collections/#{collection()}/documents/import?action=create") + headers = [{"x-typesense-api-key", api_key()}] + + case HTTP.post(url, headers, ndjson, [:with_body]) do + {:ok, 200, _resp_headers, ndjson} -> + ndjson + |> String.split("\n") + |> Enum.each(fn json -> + case :json.decode(json) do + %{"success" => true} -> + :ok + + %{"success" => false, "error" => error, "document" => document} -> + Logger.error( + "Failed to index search item for #{package} #{version} for document #{inspect(document)}: #{inspect(error)}" + ) + end + end) + + {:ok, status, _resp_headers, _body} -> + Logger.error("Failed to index search items for #{package} #{version}: status=#{status}") + + {:error, reason} -> + Logger.error("Failed to index search items #{package} #{version}: #{inspect(reason)}") + end + end + + @impl true + def delete(package, version) do + full_package = full_package(package, version) + + query = URI.encode_query([{"filter_by", "package:#{full_package}"}]) + url = url("collections/#{collection()}/documents?" 
<> query) + headers = [{"x-typesense-api-key", api_key()}] + + case HTTP.delete(url, headers) do + {:ok, 200, _resp_headers, _body} -> + :ok + + {:ok, status, _resp_headers, _body} -> + Logger.error("Failed to delete search items for #{package} #{version}: status=#{status}") + + {:error, reason} -> + Logger.error( + "Failed to delete search items for #{package} #{version}: #{inspect(reason)}" + ) + end + end + + @spec collection :: String.t() + def collection do + Application.fetch_env!(:hexdocs, :typesense_collection) + end + + @spec collection_schema :: map + def collection_schema(collection \\ collection()) do + %{ + "fields" => [ + %{"facet" => true, "name" => "proglang", "type" => "string"}, + %{"facet" => true, "name" => "type", "type" => "string"}, + %{"name" => "title", "type" => "string"}, + %{"name" => "doc", "type" => "string"}, + %{"facet" => true, "name" => "package", "type" => "string"} + ], + "name" => collection, + "token_separators" => [".", "_", "-", " ", ":", "@", "/"] + } + end + + @spec api_key :: String.t() + def api_key do + Application.fetch_env!(:hexdocs, :typesense_api_key) + end + + defp full_package(package, version) do + "#{package}-#{version}" + end + + defp url(path) do + base_url = Application.fetch_env!(:hexdocs, :typesense_url) + Path.join(base_url, path) + end +end diff --git a/test/hexdocs/search_test.exs b/test/hexdocs/search_test.exs new file mode 100644 index 0000000..ab778ea --- /dev/null +++ b/test/hexdocs/search_test.exs @@ -0,0 +1,290 @@ +defmodule Hexdocs.SearchTest do + use ExUnit.Case + import ExUnit.CaptureLog + alias Hexdocs.Search.Typesense + + @moduletag :typesense + + setup %{test: test} do + Mox.set_mox_global() + + Hexdocs.HexpmMock + |> Mox.stub(:hexdocs_sitemap, fn -> "this is the sitemap" end) + |> Mox.stub(:get_package, fn _repo, _package -> %{"releases" => []} end) + + orignal_search_impl = Application.get_env(:hexdocs, :search_impl) + on_exit(fn -> Application.put_env(:hexdocs, :search_impl, 
orignal_search_impl) end) + Application.put_env(:hexdocs, :search_impl, Typesense) + + typesense_new_collection() + + {:ok, package: test} + end + + defp run_upload(package, version, files) do + tar = Hexdocs.Tar.create(files) + key = "docs/#{package}-#{version}.tar.gz" + Hexdocs.Store.put!(:repo_bucket, key, tar) + ref = Broadway.test_message(Hexdocs.Queue, queue_put_message(key)) + assert_receive {:ack, ^ref, [_], []} + end + + defp run_delete(package, version) do + key = "docs/#{package}-#{version}.tar.gz" + ref = Broadway.test_message(Hexdocs.Queue, queue_delete_message(key)) + assert_receive {:ack, ^ref, [_], []} + end + + test "happy path: indexes public search_data on upload and deindexes it on delete", %{ + package: package + } do + version = "1.0.0" + + run_upload(package, version, [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", + """ + searchData={"items":[\ + {"type":"function","title":"Example.test/4","doc":"does example things","ref":"Example.html#test/4"},\ + {"type":"module","title":"Example","doc":"example text","ref":"Example.html"}\ + ],"content_type":"text/markdown","producer":{"name":"ex_doc","version":[48,46,51,52,46,50]}}\ + """} + ]) + + full_package = "#{package}-#{version}" + + assert [ + %{ + "document" => %{ + "doc" => "example text", + "package" => ^full_package, + "proglang" => "elixir", + "ref" => "Example.html", + "title" => "Example", + "type" => "module" + } + }, + %{ + "document" => %{ + "doc" => "does example things", + "package" => ^full_package, + "proglang" => "elixir", + "ref" => "Example.html#test/4", + "title" => "Example.test/4", + "type" => "function" + } + } + ] = typesense_search(%{"q" => "example", "query_by" => "title"}) + + assert [ + %{ + "document" => %{ + "doc" => "does example things", + "proglang" => "elixir", + "package" => ^full_package, + "ref" => "Example.html#test/4", + "title" => "Example.test/4", + "type" => "function" + } + } + ] = typesense_search(%{"q" => "thing", "query_by" => 
"doc"}) + + run_delete(package, version) + + assert typesense_search(%{"q" => "example", "query_by" => "title"}) == [] + end + + test "extracts proglang from search_data if available", %{package: package} do + run_upload(package, "1.0.0", [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", + """ + searchData={"items":[{"type":"module","title":"Example","doc":"example text","ref":"Example.html"}],\ + "content_type":"text/markdown","producer":{"name":"ex_doc","version":[48,46,51,52,46,50]},\ + "proglang":"erlang"}\ + """} + ]) + + assert [%{"document" => %{"title" => "Example", "proglang" => "erlang"}}] = + typesense_search(%{ + "q" => "example", + "query_by" => "title", + "filter" => "proglang:erlang" + }) + end + + test "logs an info message if search_data is not found", %{package: package} do + original_log_level = Logger.level() + Logger.configure(level: :info) + on_exit(fn -> Logger.configure(level: original_log_level) end) + + log = + capture_log(fn -> + run_upload(package, "1.0.0", [{"index.html", "contents"}]) + end) + + assert log =~ "[info] Failed to find search data for #{package} 1.0.0" + assert typesense_search(%{"q" => package, "query_by" => "package"}) == [] + end + + test "logs an error message if search_data.js file has unexpected format", %{package: package} do + files = [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", "unexpected format"} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + assert log =~ "[error] Unexpected search_data format for #{package} 1.0.0" + assert typesense_search(%{"q" => package, "query_by" => "package"}) == [] + end + + test "logs an error message if search_data.json cannot be decoded", %{package: package} do + files = [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", "searchData={\"items\":["} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + + assert log =~ + "[error] Failed to decode search data json for #{package} 1.0.0: 
:unexpected_end" + + assert typesense_search(%{"q" => package, "query_by" => "package"}) == [] + end + + test "logs an error message if search_data has empty items", %{package: package} do + files = [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", "searchData={\"items\":[]}"} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + + assert log =~ + "[error] Failed to extract search items and proglang from search data for #{package} 1.0.0" + + assert typesense_search(%{"q" => package, "query_by" => "package"}) == [] + end + + test "logs an error message if search_data has no items", %{package: package} do + files = [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", "searchData={\"not_items\":[]}"} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + + assert log =~ + "[error] Failed to extract search items and proglang from search data for #{package} 1.0.0" + + assert typesense_search(%{"q" => package, "query_by" => "package"}) == [] + end + + test "logs errors when indexing incomplete search items", %{package: package} do + files = [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", + """ + searchData={"items":[\ + {"type":"whatever"},\ + {"type":"function","title":"Example.test/4","doc":"does example things","ref":"Example.html#test/4"},\ + {"type":"module","title":"Example","doc":"example text","ref":"Example.html"}\ + ],"content_type":"text/markdown","producer":{"name":"ex_doc","version":[48,46,51,52,46,50]},\ + "proglang":"elixir"}\ + """} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + + assert log =~ "[error] Failed to index search item for #{package} 1.0.0 for document " + assert log =~ "Field `doc` has been declared in the schema, but is not found in the document." 
+ + # the valid documents should still be indexed + assert [_, _] = + typesense_search(%{ + "q" => "example", + "query_by" => "title", + "filter" => "proglang:elixir" + }) + end + + test "logs errors when indexing invalid search items", %{package: package} do + files = + [ + {"index.html", "contents"}, + {"dist/search_data-0F918FFD.js", + """ + searchData={"items":[\ + {"type":["function"],"title":"Example.test/4","doc":"does example things","ref":["Example.html#test/4"]},\ + {"type":"function","title":"Example.test/4","doc":"does example things","ref":"Example.html#test/4"},\ + {"type":{"a":"module"},"title":"Example","doc":{"content":"example text"},"ref":"Example.html"},\ + {"type":"module","title":"Example","doc":"example text","ref":"Example.html"}\ + ],"content_type":"text/markdown","producer":{"name":"ex_doc","version":[48,46,51,52,46,50]},\ + "proglang":"elixir"}\ + """} + ] + + log = capture_log(fn -> run_upload(package, "1.0.0", files) end) + + assert log =~ "[error] Failed to index search item for #{package} 1.0.0 for document " + assert log =~ "Field `type` must be a string." + assert log =~ "Field `doc` must be a string." 
+ + # the valid documents should still be indexed + assert [_, _] = + typesense_search(%{ + "q" => "example", + "query_by" => "title", + "filter" => "proglang:elixir" + }) + end + + defp queue_put_message(key) do + Jason.encode!(%{ + "Records" => [ + %{ + "eventName" => "ObjectCreated:Put", + "s3" => %{"object" => %{"key" => key}} + } + ] + }) + end + + defp queue_delete_message(key) do + Jason.encode!(%{ + "Records" => [ + %{ + "eventName" => "ObjectRemoved:Delete", + "s3" => %{"object" => %{"key" => key}} + } + ] + }) + end + + defp typesense_new_collection do + collection = Typesense.collection() + api_key = Typesense.api_key() + headers = [{"x-typesense-api-key", api_key}, {"content-type", "application/json"}] + payload = Jason.encode_to_iodata!(Typesense.collection_schema(collection)) + + assert {:ok, 201, _resp_headers, _ref} = + :hackney.post("http://localhost:8108/collections", headers, payload) + + on_exit(fn -> :hackney.delete("http://localhost:8108/collections/#{collection}", headers) end) + end + + defp typesense_search(query) do + collection = Typesense.collection() + api_key = Typesense.api_key() + + url = + "http://localhost:8108/collections/#{collection}/documents/search?" <> + URI.encode_query(query) + + headers = [{"x-typesense-api-key", api_key}] + assert {:ok, 200, _resp_headers, ref} = :hackney.get(url, headers) + assert {:ok, body} = :hackney.body(ref) + assert %{"hits" => hits} = Jason.decode!(body) + hits + end +end diff --git a/test/test_helper.exs b/test/test_helper.exs index f82f1d3..caea947 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -3,4 +3,21 @@ File.mkdir("tmp") Mox.defmock(Hexdocs.HexpmMock, for: Hexdocs.Hexpm) Mox.defmock(Hexdocs.SourceRepo.Mock, for: Hexdocs.SourceRepo) -ExUnit.start() + +if :typesense in ExUnit.configuration()[:include] do + typesense_available? 
= + case Hexdocs.HTTP.get("http://localhost:8108/health", _req_headers = []) do + {:ok, 200, _resp_headers, ~s|{"ok":true}|} -> true + {:error, :econnrefused} -> false + end + + unless typesense_available? do + Mix.shell().error(""" + To enable Typesense tests, start the local container with the following command: + + docker compose up -d typesense + """) + end +end + +ExUnit.start(exclude: [:typesense])