Skip to content

Commit f560e8f

Browse files
authored
Update tokenizers to 0.15.0 (#55)
* Update tokenizers to 0.15.0 * Fix clippy complaints * Update flake * Expose byte fallback for unigram
1 parent a8a7464 commit f560e8f

File tree

11 files changed

+205
-176
lines changed

11 files changed

+205
-176
lines changed

flake.lock

Lines changed: 63 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

flake.nix

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,52 +2,63 @@
22
description = "Tokenizers";
33

44
inputs = {
5+
fenix = {
6+
url = "github:nix-community/fenix";
7+
inputs.nixpkgs.follows = "nixpkgs";
8+
};
59
nixpkgs.url = "nixpkgs/nixos-unstable";
610
flake-utils.url = "github:numtide/flake-utils";
711
};
812

9-
outputs = { self, nixpkgs, flake-utils }:
13+
outputs = {
14+
self,
15+
nixpkgs,
16+
flake-utils,
17+
fenix,
18+
}:
1019
flake-utils.lib.eachSystem [
1120
flake-utils.lib.system.x86_64-linux
1221
flake-utils.lib.system.x86_64-darwin
1322
flake-utils.lib.system.aarch64-darwin
1423
flake-utils.lib.system.aarch64-linux
1524
]
16-
(system:
17-
let pkgs = import nixpkgs { inherit system; };
18-
in
19-
{
20-
devShell = pkgs.mkShell {
21-
buildInputs = with pkgs; [
22-
act
23-
binutils
24-
cargo
25-
cc
26-
clang
27-
clippy
28-
elixir_1_14
29-
erlang
30-
gdb
31-
gcc
32-
libiconv
33-
openssl
34-
pkg-config
35-
rustc
36-
] ++ lib.optionals stdenv.isDarwin [
37-
darwin.apple_sdk.frameworks.Foundation
38-
darwin.apple_sdk.frameworks.Carbon
39-
darwin.apple_sdk.frameworks.AppKit
40-
];
41-
shellHook = ''
42-
mkdir -p .nix-mix
43-
mkdir -p .nix-hex
44-
export MIX_HOME=$PWD/.nix-mix
45-
export HEX_HOME=$PWD/.nix-hex
46-
export PATH=$MIX_HOME/bin:$PATH
47-
export PATH=$HEX_HOME/bin:$PATH
48-
export PATH=$MIX_HOME/escripts:$PATH
49-
export ERL_AFLAGS="-kernel shell_history enabled"
50-
'';
51-
};
52-
});
25+
(system: let
26+
pkgs = import nixpkgs {inherit system;};
27+
in {
28+
devShell = pkgs.mkShell {
29+
buildInputs = with pkgs;
30+
[
31+
act
32+
binutils
33+
clang
34+
elixir_1_15
35+
(fenix.packages."${system}".complete.withComponents [
36+
"cargo"
37+
"clippy"
38+
"rust-src"
39+
"rustc"
40+
"rustfmt"
41+
])
42+
gcc
43+
libiconv
44+
openssl
45+
pkg-config
46+
]
47+
++ lib.optionals stdenv.isDarwin [
48+
darwin.apple_sdk.frameworks.Foundation
49+
darwin.apple_sdk.frameworks.Carbon
50+
darwin.apple_sdk.frameworks.AppKit
51+
];
52+
shellHook = ''
53+
mkdir -p .nix-mix
54+
mkdir -p .nix-hex
55+
export MIX_HOME=$PWD/.nix-mix
56+
export HEX_HOME=$PWD/.nix-hex
57+
export PATH=$MIX_HOME/bin:$PATH
58+
export PATH=$HEX_HOME/bin:$PATH
59+
export PATH=$MIX_HOME/escripts:$PATH
60+
export ERL_AFLAGS="-kernel shell_history enabled"
61+
'';
62+
};
63+
});
5364
}

lib/tokenizers/model/bpe.ex

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ defmodule Tokenizers.Model.BPE do
22
@typedoc """
33
Options for model initialisation.
44
5+
* `:byte_fallback` - whether to use the byte fallback trick
6+
57
* `:cache_capacity` - the number of words that the BPE cache can
68
contain. The cache speeds up the process by keeping
79
the result of the merge operations for a number of words.

lib/tokenizers/model/unigram.ex

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ defmodule Tokenizers.Model.Unigram do
22
@typedoc """
33
Options for model initialisation.
44
5+
* `:byte_fallback` - whether to use the byte fallback trick
56
* `:unk_id` - the unknown token id to be used by the model
67
78
"""
89
@type options() :: [
10+
byte_fallback: boolean(),
911
unk_id: float()
1012
]
1113

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do
22
use Mix.Project
33

44
@source_url "https://github.com/elixir-nx/tokenizers"
5-
@version "0.4.0"
5+
@version "0.5.0-dev"
66

77
def project do
88
[

0 commit comments

Comments
 (0)