Skip to content

Commit 5eb444e

Browse files
polvalentejosevalimjonatanklosko
authored
fix: make exla build resilient to stale upgrades (#1548)
Co-authored-by: José Valim <[email protected]> Co-authored-by: Jonatan Kłosko <[email protected]>
1 parent c82702b commit 5eb444e

File tree

5 files changed

+86
-14
lines changed

5 files changed

+86
-14
lines changed

exla/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib
88
XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include
99

1010
# Cache configuration
11-
EXLA_CACHE_SO = cache/libexla.so
12-
EXLA_CACHE_OBJ_DIR = cache/objs
11+
EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so
12+
EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs
1313

1414
# Private configuration
1515
EXLA_DIR = c_src/exla

exla/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,26 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th
4848

4949
For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4).
5050

51+
## Troubleshooting
52+
53+
EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality.
54+
If for any reason these fail to compile or load, troubleshooting can be tricky.
55+
56+
We recommend following the steps below:
57+
58+
1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files:
59+
* `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones).
60+
* `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds.
61+
62+
Additional notes on compilation:
63+
* Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important.
64+
* Remember to save the compilation logs from this step for further debugging.
65+
* It is a good idea to save the `cache/<version>/libexla.so` file so that the team can inspect its contents if needed.
66+
2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up.
67+
This exception should provide more information on what's the issue when loading the NIF. Share these logs in an issue on GitHub
68+
so that the Nx team can investigate further.
69+
70+
5171
## Contributing
5272

5373
### Building locally

exla/lib/exla/nif.ex

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,20 @@ defmodule EXLA.NIF do
44

55
def __on_load__ do
66
path = :filename.join(:code.priv_dir(:exla), ~c"libexla")
7-
:erlang.load_nif(path, 0)
7+
8+
case :erlang.load_nif(path, 0) do
9+
:ok ->
10+
:ok
11+
12+
{:error, {reason, text}} ->
13+
raise """
14+
Failed to load NIF library.
15+
Follow the steps in the :exla README Troubleshooting section for more information.
16+
17+
#{inspect(reason)}
18+
#{text}
19+
"""
20+
end
821
end
922

1023
def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef)

exla/mix.exs

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ defmodule EXLA.MixProject do
3535

3636
%{
3737
"MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}",
38-
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv
38+
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv,
39+
"EXLA_VERSION" => "#{@version}"
3940
}
4041
end,
4142
make_args: make_args
@@ -133,7 +134,38 @@ defmodule EXLA.MixProject do
133134
{:ok, []}
134135
end
135136

136-
defp cached_make(_) do
137+
defp cached_make(args) do
138+
force_rebuild_mode =
139+
case System.get_env("EXLA_FORCE_REBUILD", "") do
140+
"" ->
141+
:none
142+
143+
"0" ->
144+
:none
145+
146+
"partial" ->
147+
:partial
148+
149+
"true" ->
150+
:full
151+
152+
"1" ->
153+
:full
154+
155+
value ->
156+
Mix.raise(
157+
"invalid value for EXLA_FORCE_REBUILD: '#{value}'. Expected one of: partial, true"
158+
)
159+
end
160+
161+
File.mkdir_p!("cache/#{@version}")
162+
163+
# remove only in full mode
164+
if force_rebuild_mode in [:partial, :full] do
165+
Mix.shell().info("Removing cached .o files in cache/#{@version}/objs")
166+
File.rm_rf!("cache/#{@version}/objs")
167+
end
168+
137169
contents =
138170
for path <- Path.wildcard("c_src/**/*"),
139171
{:ok, contents} <- [File.read(path)],
@@ -148,19 +180,27 @@ defmodule EXLA.MixProject do
148180
"elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}"
149181

150182
cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"])
151-
cached? = File.exists?(cached_so)
183+
cached? = File.exists?(cached_so) and force_rebuild_mode == :none
184+
185+
if force_rebuild_mode in [:partial, :full] do
186+
Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so")
187+
File.rm_rf!("cache/#{@version}/libexla.so")
188+
189+
Mix.shell().info("Removing libexla.so cache at #{cached_so}")
190+
File.rm_rf!(cached_so)
191+
end
152192

153193
if cached? do
154194
Mix.shell().info("Using libexla.so from #{cached_so}")
155-
File.cp!(cached_so, "cache/libexla.so")
195+
File.cp!(cached_so, "cache/#{@version}/libexla.so")
156196
end
157197

158-
result = Mix.Tasks.Compile.ElixirMake.run([])
198+
result = Mix.Tasks.Compile.ElixirMake.run(args)
159199

160200
if not cached? and match?({:ok, _}, result) do
161201
Mix.shell().info("Caching libexla.so at #{cached_so}")
162202
File.mkdir_p!(Path.dirname(cached_so))
163-
File.cp!("cache/libexla.so", cached_so)
203+
File.cp!("cache/#{@version}/libexla.so", cached_so)
164204
end
165205

166206
result

exla/test/exla/device_memory_sharing_test.exs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do
2727
end
2828

2929
@tag :cuda_required
30-
test "ipc handles don't crash the runtime when :local mode is selected" do
31-
assert {:error, ~c"Invalid pointer size for selected mode."} ==
30+
test "invalid ipc handles don't crash the runtime" do
31+
assert {:error, ~c"Unable to get pointer for IPC handle."} ==
3232
Nx.from_pointer(
3333
{EXLA.Backend, client: :cuda},
34-
Enum.to_list(0..63),
34+
%Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4},
3535
{:f, 32},
36-
{1},
37-
mode: :local
36+
{1}
3837
)
3938
end
4039
end

0 commit comments

Comments
 (0)