Changes from all commits (30 commits):
| Commit | Message | Author | Date |
|---|---|---|---|
| 88efd6b | Reorganize and refactor Suffix Decoding (#182) | sfc-gh-aqiao | Sep 16, 2025 |
| e8d252f | Add environment variable to skip version check (#186) | sfc-gh-aqiao | Sep 18, 2025 |
| e27ae09 | Enable SwiftKV when FlashInfer is not available (#187) | sfc-gh-pjoziak | Sep 18, 2025 |
| 9ecce3a | Fix hybrid mode (spec decoding + suffix) crash on structured_output (#… | sfc-gh-yewang | Sep 23, 2025 |
| 8b1d693 | Make Arctic Inference plugin opt-in instead of opt-out (#188) | sfc-gh-aqiao | Sep 23, 2025 |
| e5aa688 | Simplify min_score selection logic, correct type hint for `propose_su…` | CptTZ | Sep 25, 2025 |
| 02a8a51 | Add op_builder for jitting the kernels (#193) | sfc-gh-reyazda | Sep 25, 2025 |
| 87e2f77 | Update links in README for Shift Parallelism (#196) | sfc-gh-mhidayetoglu | Sep 26, 2025 |
| 8452500 | bump to v0.0.10 (#194) | sfc-gh-jrasley | Sep 26, 2025 |
| 170082f | init | sfc-gh-yewang | Sep 29, 2025 |
| 5a410f1 | Revert "init" | sfc-gh-yewang | Sep 29, 2025 |
| cbeb679 | Move SwiftKV ops to JIT-build (#198) | sfc-gh-yewang | Sep 29, 2025 |
| c4cb213 | Add @sfc-gh-reyazda as code owner (#199) | sfc-gh-yewang | Sep 29, 2025 |
| 1408c80 | Explicitly initialize CUDA buffers for next tokens (#201) | sfc-gh-yewang | Oct 6, 2025 |
| 353e102 | Port suffix decoding to nanobind (#206) | sfc-gh-aqiao | Oct 13, 2025 |
| e1d1ff2 | upgrade to vllm 0.10.1 (#162) | sfc-gh-yewang | Oct 13, 2025 |
| 6adb69f | Suffix decoding: break out of speculate loop early (#207) | sfc-gh-aqiao | Oct 13, 2025 |
| 1bc893f | Suffix decoding speculation optimization (#211) | sfc-gh-aqiao | Oct 17, 2025 |
| 2a14b27 | reshape_and_cache_flash fp4 kernel (#210) | sfc-gh-yewang | Oct 17, 2025 |
| 8d5d124 | More suffix decoding optimizations (#212) | sfc-gh-aqiao | Oct 20, 2025 |
| 3988caf | remove ulysses moe patch (#213) | sfc-gh-mhidayetoglu | Oct 21, 2025 |
| 3145f23 | Bump version from 0.0.10 to 0.1.0 (#214) | sfc-gh-jrasley | Oct 21, 2025 |
| d88d4de | Silence logging if plugin is disabled (#221) | sfc-gh-aqiao | Nov 6, 2025 |
| db56537 | Bump version from 0.1.0 to 0.1.1 (#222) | sfc-gh-jrasley | Nov 6, 2025 |
| d096fdf | Communication Fusing (#224) | sfc-gh-mhidayetoglu | Nov 19, 2025 |
| 5e08f0f | patch for running traces with timestamps (#228) | sfc-gh-mhidayetoglu | Dec 5, 2025 |
| 0ea6a68 | rebase to vllm 0.11.0 (#216) | sfc-gh-yewang | Dec 30, 2025 |
| c6bee37 | Bump version from 0.1.1 to 0.1.2 | sfc-gh-yewang | Jan 24, 2026 |
| d223cb5 | Bump version from 0.1.2 to 0.1.3 | sfc-gh-yewang | Jan 24, 2026 |
| cf431e4 | Reproducibility extension (#239) | sfc-gh-mhidayetoglu | Jan 26, 2026 |
`.github/CODEOWNERS` (1 addition, 1 deletion)

```diff
@@ -1 +1 @@
-* @sfc-gh-aqiao @sfc-gh-jrasley @sfc-gh-mhidayetoglu @sfc-gh-yewang @sfc-gh-goliaro
+* @sfc-gh-aqiao @sfc-gh-jrasley @sfc-gh-mhidayetoglu @sfc-gh-yewang @sfc-gh-goliaro @sfc-gh-reyazda
```
`README.md` (12 additions, 4 deletions)

````diff
@@ -36,10 +36,10 @@ Arctic Inference achieves high throughput and low latency through a wholistic se
 <tbody>
 <tr>
 <td align="left">
-Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>,
-<a href="https://arxiv.org/abs/2507.11830">paper</a>)
+Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>)
 <br>
-Shift Parallelism (<a href="https://www.snowflake.com/en/engineering-blog/arctic-inference-shift-parallelism/">blog</a>)
+Shift Parallelism (<a href="https://www.snowflake.com/en/engineering-blog/arctic-inference-shift-parallelism/">blog</a>,
+<a href="https://arxiv.org/abs/2509.16495">paper</a>)
 </td>
 <td align="left">
 Arctic Speculator (<a href="https://www.snowflake.com/en/engineering-blog/fast-speculative-decoding-vllm-arctic/">blog</a>)
@@ -105,7 +105,7 @@ By using the examples below, you can get benefits from Shift Parallelism, Specul
 #### Serving
 
 ```console
-vllm serve Snowflake/Llama-3.1-SwiftKV-8B-Instruct \
+ARCTIC_INFERENCE_ENABLED=1 vllm serve Snowflake/Llama-3.1-SwiftKV-8B-Instruct \
 --quantization "fp8" \
 --tensor-parallel-size 1 \
 --ulysses-sequence-parallel-size 2 \
@@ -121,6 +121,8 @@ vllm serve Snowflake/Llama-3.1-SwiftKV-8B-Instruct \
 
 #### Offline
 
+Save the following script to `arctic_example.py`:
+
 ```python
 import vllm
 from vllm import LLM, SamplingParams
@@ -156,6 +158,12 @@ outputs = llm.chat(conversation, sampling_params=sampling_params)
 print(outputs[0].outputs[0].text)
 ```
 
+Run the script with Arctic Inference enabled:
+
+```console
+ARCTIC_INFERENCE_ENABLED=1 python arctic_example.py
+```
+
 ## Citation
 ```
 @misc{arcticinference2025,
```
````
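The diff collapses the middle of the offline script between the two hunks. For orientation, a minimal runnable sketch of such a script is below; only the imports, the `llm.chat(...)` call, and the final `print` are taken from the visible diff lines, and the model name comes from the serving example above. The conversation contents and sampling settings are illustrative assumptions, not the README's actual values.

```python
# arctic_example.py -- minimal sketch of the README's offline example.
# Conversation and sampling values here are illustrative assumptions.
import vllm  # noqa: F401  (imported in the README example)
from vllm import LLM, SamplingParams

# Model name taken from the README's serving example.
llm = LLM(model="Snowflake/Llama-3.1-SwiftKV-8B-Instruct")

sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

conversation = [
    {"role": "user", "content": "Write a haiku about fast inference."},
]

# These two lines match the ones visible in the diff.
outputs = llm.chat(conversation, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```

Because this PR series makes the plugin opt-in (#188), the script only activates Arctic Inference when launched as `ARCTIC_INFERENCE_ENABLED=1 python arctic_example.py`.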
`arctic_inference/envs.py` (8 additions, 0 deletions)

```diff
@@ -20,10 +20,18 @@
 ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK: bool = False
 
 environment_variables: dict[str, Callable[[], Any]] = {
+    "ARCTIC_INFERENCE_ENABLED":
+    lambda: os.getenv("ARCTIC_INFERENCE_ENABLED", "0") == "1",
     "ARCTIC_INFERENCE_SKIP_PLATFORM_CHECK":
     lambda: os.getenv("ARCTIC_INFERENCE_SKIP_PLATFORM_CHECK", "0") == "1",
     "ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK":
     lambda: os.getenv("ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK", "0") == "1",
+    "ARCTIC_INFERENCE_SKIP_VERSION_CHECK":
+    lambda: os.getenv("ARCTIC_INFERENCE_SKIP_VERSION_CHECK", "0") == "1",
 }
 
+# temporary workaround for gpt-oss model
+ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK = 1
+
 def __getattr__(name: str) -> Any:
     if name in environment_variables:
```