2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.8"
+__version__ = "0.2.9"
 
 # Import from server module
 from .server import (
9 changes: 6 additions & 3 deletions optillm/inference.py
@@ -1029,7 +1029,7 @@ def _load_model():
logger.info(f"Using device: {device}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"))

# Base kwargs for model loading
model_kwargs = {
@@ -1076,6 +1076,7 @@ def _load_model():
     try:
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
+            token=os.getenv("HF_TOKEN"),
             **model_kwargs
         )
     except Exception as e:
@@ -1085,6 +1086,7 @@
             model_kwargs.pop("attn_implementation")
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
+                token=os.getenv("HF_TOKEN"),
                 **model_kwargs
             )
         elif model_kwargs["torch_dtype"] == torch.float16:
@@ -1094,6 +1096,7 @@
model_kwargs["torch_dtype"] = torch.float32
model = AutoModelForCausalLM.from_pretrained(
model_id,
token=os.getenv("HF_TOKEN"),
**model_kwargs
)

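All four `_load_model` hunks thread the same `token=os.getenv("HF_TOKEN")` argument through `AutoTokenizer.from_pretrained` and every `AutoModelForCausalLM.from_pretrained` call, including the retry paths. When the environment variable is unset, `os.getenv` returns `None` and the Hub client behaves exactly as before (anonymous access); when it is set, gated and private model repositories become loadable. A minimal sketch of the resulting load path, with `model_id` as a placeholder and the float16-to-float32 retry condensed from the surrounding code:

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# None when HF_TOKEN is unset (anonymous access); a token string grants
# access to gated/private repos the account can read.
hf_token = os.getenv("HF_TOKEN")

model_id = "your-org/your-gated-model"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(
    model_id, trust_remote_code=True, token=hf_token
)

model_kwargs = {"torch_dtype": torch.float16}
try:
    model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, **model_kwargs)
except Exception:
    # Condensed version of the retry above: fall back to float32 if half
    # precision is unsupported on this device.
    model_kwargs["torch_dtype"] = torch.float32
    model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token, **model_kwargs)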
@@ -1134,7 +1137,7 @@ def validate_adapter(self, adapter_id: str) -> bool:
             config = PeftConfig.from_pretrained(
                 adapter_id,
                 trust_remote_code=True,
-                use_auth_token=os.getenv("HF_TOKEN")
+                token=os.getenv("HF_TOKEN")
             )
             return True
         except Exception as e:
@@ -1159,7 +1162,7 @@ def _load_adapter():
         config = PeftConfig.from_pretrained(
             adapter_id,
             trust_remote_code=True,
-            use_auth_token=os.getenv("HF_TOKEN")
+            token=os.getenv("HF_TOKEN")
         )
 
         model = base_model
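The two `PeftConfig.from_pretrained` hunks are a rename rather than a behavior change: `use_auth_token` has been deprecated across the Hugging Face libraries in favor of `token`, and newer releases of `huggingface_hub` and `peft` warn on (or reject) the old keyword. A sketch of the updated adapter validation, with a hypothetical helper name and a placeholder adapter id:

import os

from peft import PeftConfig

def adapter_is_loadable(adapter_id: str) -> bool:
    """Hypothetical helper: True if the adapter's config can be fetched from the Hub."""
    try:
        PeftConfig.from_pretrained(
            adapter_id,
            trust_remote_code=True,
            token=os.getenv("HF_TOKEN"),  # was: use_auth_token=...
        )
        return True
    except Exception:
        return False

print(adapter_is_loadable("your-org/your-lora-adapter"))  # placeholder id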
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.8"
+version = "0.2.9"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
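Note that the release number now has to be bumped in two places (`optillm/__init__.py` and `pyproject.toml`), which is easy to let drift. A small consistency check one could run in CI, assuming it executes from the repository root on Python 3.11+ (for the stdlib `tomllib`):

# Fail if the two version declarations disagree.
import re
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject_version = tomllib.load(f)["project"]["version"]

with open("optillm/__init__.py") as f:
    match = re.search(r'__version__\s*=\s*"([^"]+)"', f.read())

assert match and match.group(1) == pyproject_version, (
    f"version mismatch: {match.group(1) if match else None} != {pyproject_version}"
)
print(f"versions agree: {pyproject_version}")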