vllm-project
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
Lines changed: 0 additions & 5 deletions
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
Lines changed: 95 additions & 73 deletions
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
Lines changed: 0 additions & 5 deletions
@@ -12,9 +12,6 @@
 from typing import Optional, Union
 
 import jinja2
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm_cutlass_library_extension import (
     DataType,
     EpilogueScheduleTag,
@@ -31,8 +28,6 @@
     VLLMKernelScheduleTag,
 )
 
-# yapf: enable
-
 #
 #   Generator templating
 #
 
@@ -21,8 +21,6 @@
 logger = logging.getLogger()
 
 
-# yapf conflicts with isort for this docstring
-# yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and 
 deserialize vLLM models. These models can be loaded using tensorizer 
@@ -132,7 +130,8 @@ def get_parser():
         "can be loaded using tensorizer directly to the GPU "
         "extremely quickly. Tensor encryption and decryption is "
         "also supported, although libsodium must be installed to "
-        "use it.")
+        "use it."
+    )
     parser = EngineArgs.add_cli_args(parser)
 
     parser.add_argument(
@@ -144,13 +143,14 @@ def get_parser():
         "along with the model by instantiating a TensorizerConfig object, "
         "creating a dict from it with TensorizerConfig.to_serializable(), "
         "and passing it to LoRARequest's initializer with the kwarg "
-        "tensorizer_config_dict."
+        "tensorizer_config_dict.",
     )
 
-    subparsers = parser.add_subparsers(dest='command', required=True)
+    subparsers = parser.add_subparsers(dest="command", required=True)
 
     serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
+        "serialize", help="Serialize a model to `--serialized-directory`"
+    )
 
     serialize_parser.add_argument(
         "--suffix",
@@ -163,7 +163,9 @@ def get_parser():
             "`--suffix` is `v1`, the serialized model tensors will be "
             "saved to "
             "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-            "If none is provided, a random UUID will be used."))
+            "If none is provided, a random UUID will be used."
+        ),
+    )
     serialize_parser.add_argument(
         "--serialized-directory",
         type=str,
@@ -175,108 +177,127 @@ def get_parser():
         "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
         "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
         "where `suffix` is given by `--suffix` or a random UUID if not "
-        "provided.")
+        "provided.",
+    )
 
     serialize_parser.add_argument(
         "--serialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's TensorSerializer during "
-              "serialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's TensorSerializer during "
+            "serialization."
+        ),
+    )
 
     serialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
-              " and save the key at this path"))
+        help=(
+            "Encrypt the model weights with a randomly-generated binary key,"
+            " and save the key at this path"
+        ),
+    )
 
     deserialize_parser = subparsers.add_parser(
-        'deserialize',
-        help=("Deserialize a model from `--path-to-tensors`"
-              " to verify it can be loaded and used."))
+        "deserialize",
+        help=(
+            "Deserialize a model from `--path-to-tensors`"
+            " to verify it can be loaded and used."
+        ),
+    )
 
     deserialize_parser.add_argument(
         "--path-to-tensors",
         type=str,
         required=False,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
+        help="The local path or S3 URI to the model tensors to deserialize. ",
+    )
 
     deserialize_parser.add_argument(
         "--serialized-directory",
         type=str,
         required=False,
         help="Directory with model artifacts for loading. Assumes a "
-             "model.tensors file exists therein. Can supersede "
-             "--path-to-tensors.")
+        "model.tensors file exists therein. Can supersede "
+        "--path-to-tensors.",
+    )
 
     deserialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
-              " if the model was serialized with encryption"))
+        help=(
+            "Path to a binary key to use to decrypt the model weights,"
+            " if the model was serialized with encryption"
+        ),
+    )
 
     deserialize_parser.add_argument(
         "--deserialization-kwargs",
         type=tensorizer_kwargs_arg,
         required=False,
-        help=("A JSON string containing additional keyword arguments to "
-              "pass to Tensorizer's `TensorDeserializer` during "
-              "deserialization."))
+        help=(
+            "A JSON string containing additional keyword arguments to "
+            "pass to Tensorizer's `TensorDeserializer` during "
+            "deserialization."
+        ),
+    )
 
     TensorizerArgs.add_cli_args(deserialize_parser)
 
     return parser
 
-def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
-                                              cfg: TensorizerConfig):
+
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict, cfg: TensorizerConfig):
     for k, v in extra_cfg.items():
         if hasattr(cfg, k):
             setattr(cfg, k, v)
             logger.info(
                 "Updating TensorizerConfig with %s from "
-                "--model-loader-extra-config provided", k
+                "--model-loader-extra-config provided",
+                k,
             )
 
+
 def deserialize(args, tensorizer_config):
     if args.lora_path:
         tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config,
-                  enable_lora=True,
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
+            enable_lora=True,
         )
         sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=256,
-            stop=["[/assistant]"]
+            temperature=0, max_tokens=256, stop=["[/assistant]"]
         )
 
         # Truncating this as the extra text isn't necessary
-        prompts = [
-            "[user] Write a SQL query to answer the question based on ..."
-        ]
+        prompts = ["[user] Write a SQL query to answer the question based on ..."]
 
         # Test LoRA load
         print(
             llm.generate(
-            prompts,
-            sampling_params,
-            lora_request=LoRARequest("sql-lora",
-                                     1,
-                                     args.lora_path,
-                                     tensorizer_config_dict = tensorizer_config
-                                     .to_serializable())
+                prompts,
+                sampling_params,
+                lora_request=LoRARequest(
+                    "sql-lora",
+                    1,
+                    args.lora_path,
+                    tensorizer_config_dict=tensorizer_config.to_serializable(),
+                ),
             )
         )
     else:
-        llm = LLM(model=args.model,
-                  load_format="tensorizer",
-                  tensor_parallel_size=args.tensor_parallel_size,
-                  model_loader_extra_config=tensorizer_config
+        llm = LLM(
+            model=args.model,
+            load_format="tensorizer",
+            tensor_parallel_size=args.tensor_parallel_size,
+            model_loader_extra_config=tensorizer_config,
         )
     return llm
 
@@ -285,17 +306,20 @@ def main():
     parser = get_parser()
     args = parser.parse_args()
 
-    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
-                        or os.environ.get("S3_ACCESS_KEY_ID", None))
-    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
-                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
-    s3_endpoint = (getattr(args, 's3_endpoint', None)
-                or os.environ.get("S3_ENDPOINT_URL", None))
+    s3_access_key_id = getattr(args, "s3_access_key_id", None) or os.environ.get(
+        "S3_ACCESS_KEY_ID", None
+    )
+    s3_secret_access_key = getattr(
+        args, "s3_secret_access_key", None
+    ) or os.environ.get("S3_SECRET_ACCESS_KEY", None)
+    s3_endpoint = getattr(args, "s3_endpoint", None) or os.environ.get(
+        "S3_ENDPOINT_URL", None
+    )
 
     credentials = {
         "s3_access_key_id": s3_access_key_id,
         "s3_secret_access_key": s3_secret_access_key,
-        "s3_endpoint": s3_endpoint
+        "s3_endpoint": s3_endpoint,
     }
 
     model_ref = args.model
@@ -309,25 +333,25 @@ def main():
     if args.model_loader_extra_config:
         extra_config = json.loads(args.model_loader_extra_config)
 
-
-    tensorizer_dir = (args.serialized_directory or
-                      extra_config.get("tensorizer_dir"))
-    tensorizer_uri = (getattr(args, "path_to_tensors", None)
-                      or extra_config.get("tensorizer_uri"))
+    tensorizer_dir = args.serialized_directory or extra_config.get("tensorizer_dir")
+    tensorizer_uri = getattr(args, "path_to_tensors", None) or extra_config.get(
+        "tensorizer_uri"
+    )
 
     if tensorizer_dir and tensorizer_uri:
-        parser.error("--serialized-directory and --path-to-tensors "
-                     "cannot both be provided")
+        parser.error(
+            "--serialized-directory and --path-to-tensors cannot both be provided"
+        )
 
     if not tensorizer_dir and not tensorizer_uri:
-        parser.error("Either --serialized-directory or --path-to-tensors "
-                     "must be provided")
-
+        parser.error(
+            "Either --serialized-directory or --path-to-tensors must be provided"
+        )
 
     if args.command == "serialize":
         engine_args = EngineArgs.from_cli_args(args)
 
-        input_dir = tensorizer_dir.rstrip('/')
+        input_dir = tensorizer_dir.rstrip("/")
         suffix = args.suffix if args.suffix else uuid.uuid4().hex
         base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
         if engine_args.tensor_parallel_size > 1:
@@ -339,15 +363,14 @@ def main():
             tensorizer_uri=model_path,
             encryption_keyfile=keyfile,
             serialization_kwargs=args.serialization_kwargs or {},
-            **credentials
+            **credentials,
         )
 
         if args.lora_path:
             tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
             tensorize_lora_adapter(args.lora_path, tensorizer_config)
 
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         tensorize_vllm_model(engine_args, tensorizer_config)
 
     elif args.command == "deserialize":
@@ -356,11 +379,10 @@ def main():
             tensorizer_dir=args.serialized_directory,
             encryption_keyfile=keyfile,
             deserialization_kwargs=args.deserialization_kwargs or {},
-            **credentials
+            **credentials,
         )
 
-        merge_extra_config_with_tensorizer_config(extra_config,
-                                                  tensorizer_config)
+        merge_extra_config_with_tensorizer_config(extra_config, tensorizer_config)
         deserialize(args, tensorizer_config)
     else:
         raise ValueError("Either serialize or deserialize must be specified.")
 
@@ -8,16 +8,11 @@
 import vllm.envs as envs
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.compilation.activation_quant_fusion import (
     FUSED_OPS,
     SILU_MUL_OP,
     ActivationQuantFusionPass,
 )
-
-# yapf: enable
 from vllm.compilation.fusion import QUANT_OPS
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
Original file line number	Diff line number	Diff line change
`@@ -12,9 +12,6 @@`
`12`	`12`	`from typing import Optional, Union`
`13`	`13`
`14`	`14`	`import jinja2`
`15`		`-`
`16`		`-# yapf conflicts with isort for this block`
`17`		`-# yapf: disable`
`18`	`15`	`from vllm_cutlass_library_extension import (`
`19`	`16`	`DataType,`
`20`	`17`	`EpilogueScheduleTag,`
`@@ -31,8 +28,6 @@`
`31`	`28`	`VLLMKernelScheduleTag,`
`32`	`29`	`)`
`33`	`30`
`34`		`-# yapf: enable`
`35`		`-`
`36`	`31`	`#`
`37`	`32`	`# Generator templating`
`38`	`33`	`#`