44import platform
55import unittest
66
7- import numpy as np
87import pytest
98import torch
109from parameterized import parameterized
11- from transformers import (
12- AutoModelForCausalLM ,
13- AutoTokenizer ,
14- GenerationConfig ,
15- PretrainedConfig ,
16- pipeline ,
17- set_seed ,
18- )
10+ from transformers import AutoModelForCausalLM , AutoTokenizer , GenerationConfig , PretrainedConfig , pipeline , set_seed
1911from transformers .testing_utils import slow
20- from utils_tests import MODEL_NAMES , get_num_sdpa , mock_torch_cuda_is_available , patch_awq_for_inference
12+ from utils_tests import (
13+ F32_CONFIG ,
14+ MODEL_NAMES ,
15+ OPENVINO_DEVICE ,
16+ SEED ,
17+ get_num_sdpa ,
18+ mock_torch_cuda_is_available ,
19+ patch_awq_for_inference ,
20+ )
2121
2222from optimum .exporters .openvino .model_patcher import patch_update_causal_mask
2323from optimum .intel import OVModelForCausalLM , OVModelForSequenceClassification
2929if is_transformers_version (">=" , "4.55" ):
3030 from transformers import Mxfp4Config
3131
32- SEED = 42
33- F32_CONFIG = {"INFERENCE_PRECISION_HINT" : "f32" }
34- TENSOR_ALIAS_TO_TYPE = {"pt" : torch .Tensor , "np" : np .ndarray }
35-
3632
3733class OVModelForCausalLMIntegrationTest (unittest .TestCase ):
3834 SUPPORTED_ARCHITECTURES = (
@@ -240,7 +236,9 @@ def test_compare_to_transformers(self, model_arch):
240236 if model_arch == "gemma2" :
241237 model_kwargs ["attn_implementation" ] = "sdpa"
242238
243- ov_model = OVModelForCausalLM .from_pretrained (model_id , export = True , ov_config = F32_CONFIG , ** model_kwargs )
239+ ov_model = OVModelForCausalLM .from_pretrained (
240+ model_id , export = True , ov_config = F32_CONFIG , device = OPENVINO_DEVICE , ** model_kwargs
241+ )
244242 self .assertIsInstance (ov_model .config , PretrainedConfig )
245243 self .assertTrue (ov_model .use_cache )
246244 tokenizer = AutoTokenizer .from_pretrained (model_id , trust_remote_code = model_arch in self .REMOTE_CODE_MODELS )
@@ -388,7 +386,9 @@ def test_pipeline(self, model_arch):
388386 additional_args ["use_model_defaults" ] = False
389387
390388 set_seed (SEED )
391- model = OVModelForCausalLM .from_pretrained (model_id , use_cache = True , compile = False , ** model_kwargs )
389+ model = OVModelForCausalLM .from_pretrained (
390+ model_id , use_cache = True , compile = False , device = OPENVINO_DEVICE , ** model_kwargs
391+ )
392392 model .eval ()
393393 model .config .encoder_no_repeat_ngram_size = 0
394394 model .to ("cpu" )
@@ -428,7 +428,7 @@ def test_pipeline(self, model_arch):
428428
429429 def test_model_and_decoder_same_device (self ):
430430 model_id = MODEL_NAMES ["gpt2" ]
431- model = OVModelForCausalLM .from_pretrained (model_id , export = True )
431+ model = OVModelForCausalLM .from_pretrained (model_id , export = True , device = OPENVINO_DEVICE )
432432 model .to ("TEST" )
433433 self .assertEqual (model ._device , "TEST" )
434434 # Verify that request is being reset
@@ -441,13 +441,17 @@ def test_compare_with_and_without_past_key_values(self):
441441 tokenizer = AutoTokenizer .from_pretrained (model_id )
442442 tokens = tokenizer ("This is a sample input" , return_tensors = "pt" )
443443
444- model_with_pkv = OVModelForCausalLM .from_pretrained (model_id , export = True , use_cache = True , stateful = False )
444+ model_with_pkv = OVModelForCausalLM .from_pretrained (
445+ model_id , export = True , use_cache = True , stateful = False , device = OPENVINO_DEVICE
446+ )
445447 outputs_model_with_pkv = model_with_pkv .generate (
446448 ** tokens , min_length = self .GENERATION_LENGTH , max_length = self .GENERATION_LENGTH , num_beams = 1
447449 )
448450 del model_with_pkv
449451
450- model_without_pkv = OVModelForCausalLM .from_pretrained (model_id , export = True , use_cache = False )
452+ model_without_pkv = OVModelForCausalLM .from_pretrained (
453+ model_id , export = True , use_cache = False , device = OPENVINO_DEVICE
454+ )
451455 outputs_model_without_pkv = model_without_pkv .generate (
452456 ** tokens , min_length = self .GENERATION_LENGTH , max_length = self .GENERATION_LENGTH , num_beams = 1
453457 )
@@ -457,7 +461,9 @@ def test_compare_with_and_without_past_key_values(self):
457461 self .assertEqual (outputs_model_with_pkv .shape [1 ], self .GENERATION_LENGTH )
458462 self .assertEqual (outputs_model_without_pkv .shape [1 ], self .GENERATION_LENGTH )
459463
460- model_stateful = OVModelForCausalLM .from_pretrained (model_id , export = True , use_cache = True , stateful = True )
464+ model_stateful = OVModelForCausalLM .from_pretrained (
465+ model_id , export = True , use_cache = True , stateful = True , device = OPENVINO_DEVICE
466+ )
461467 outputs_model_stateful = model_stateful .generate (
462468 ** tokens , min_length = self .GENERATION_LENGTH , max_length = self .GENERATION_LENGTH , num_beams = 1
463469 )
@@ -475,7 +481,9 @@ def test_print_model_properties(self):
475481 # test setting OPENVINO_LOG_LEVEL to 3, which calls _print_compiled_model_properties
476482 openvino_log_level = os .environ .get ("OPENVINO_LOG_LEVEL" , None )
477483 os .environ ["OPENVINO_LOG_LEVEL" ] = "3"
478- model = OVModelForSequenceClassification .from_pretrained (MODEL_NAMES ["bert" ], export = True )
484+ model = OVModelForSequenceClassification .from_pretrained (
485+ MODEL_NAMES ["bert" ], export = True , device = OPENVINO_DEVICE
486+ )
479487 if openvino_log_level is not None :
480488 os .environ ["OPENVINO_LOG_LEVEL" ] = openvino_log_level
481489 # test calling function directly
@@ -496,7 +504,9 @@ def test_auto_device_loading(self):
496504
497505 def test_default_filling_attention_mask (self ):
498506 model_id = MODEL_NAMES ["gpt2" ]
499- model_with_cache = OVModelForCausalLM .from_pretrained (model_id , stateful = False , use_cache = True )
507+ model_with_cache = OVModelForCausalLM .from_pretrained (
508+ model_id , stateful = False , use_cache = True , device = OPENVINO_DEVICE
509+ )
500510 tokenizer = AutoTokenizer .from_pretrained (model_id )
501511 tokenizer .pad_token = tokenizer .eos_token
502512 texts = ["this is a simple input" ]
@@ -519,7 +529,9 @@ def test_default_filling_attention_mask(self):
519529
520530 def test_default_filling_attention_mask_and_position_ids (self ):
521531 model_id = MODEL_NAMES ["llama" ]
522- model_with_cache = OVModelForCausalLM .from_pretrained (model_id , stateful = False , use_cache = True )
532+ model_with_cache = OVModelForCausalLM .from_pretrained (
533+ model_id , stateful = False , use_cache = True , device = OPENVINO_DEVICE
534+ )
523535 tokenizer = AutoTokenizer .from_pretrained (model_id )
524536 tokenizer .pad_token = tokenizer .eos_token
525537 texts = ["this is a simple input" ]
@@ -613,11 +625,11 @@ def test_beam_search(self, model_arch):
613625 ]
614626 set_seed (SEED )
615627 ov_model_stateful = OVModelForCausalLM .from_pretrained (
616- model_id , export = True , use_cache = True , stateful = True , ** model_kwargs
628+ model_id , export = True , use_cache = True , stateful = True , device = OPENVINO_DEVICE , ** model_kwargs
617629 )
618630 set_seed (SEED )
619631 ov_model_stateless = OVModelForCausalLM .from_pretrained (
620- model_id , export = True , use_cache = True , stateful = False , ** model_kwargs
632+ model_id , export = True , use_cache = True , stateful = False , device = OPENVINO_DEVICE , ** model_kwargs
621633 )
622634 if "awq" in model_arch or "gptq" in model_arch :
623635 # infer in FP32
@@ -728,7 +740,9 @@ def test_load_with_different_dtype(self):
728740 torch_dtypes .append ("bfloat16" )
729741
730742 for dtype in torch_dtypes :
731- ov_model = OVModelForCausalLM .from_pretrained (model_id = model_id , export = True , torch_dtype = dtype )
743+ ov_model = OVModelForCausalLM .from_pretrained (
744+ model_id = model_id , export = True , torch_dtype = dtype , device = OPENVINO_DEVICE
745+ )
732746 ov_logits = ov_model (** test_input ).logits
733747 self .assertTrue (
734748 torch .allclose (torch .Tensor (ov_logits ), ref_logits , atol = 5e-3 ),
0 commit comments