diff --git a/Android.bp b/Android.bp new file mode 100644 index 00000000000..1445c50c152 --- /dev/null +++ b/Android.bp @@ -0,0 +1,139 @@ +cc_library { + name: "libexecutorch", + srcs: [ + "extension/data_loader/file_data_loader.cpp", + "extension/data_loader/mmap_data_loader.cpp", + "extension/evalue_util/print_evalue.cpp", + "extension/runner_util/inputs.cpp", + "extension/runner_util/inputs_portable.cpp", + + ":executorch_portable_ops_gen_cpp", + + "kernels/portable/cpu/*.cpp", + "kernels/portable/cpu/pattern/*.cpp", + "kernels/portable/cpu/util/*.cpp", + + "kernels/prim_ops/et_copy_index.cpp", + "kernels/prim_ops/et_view.cpp", + "kernels/prim_ops/register_prim_ops.cpp", + + "runtime/backend/interface.cpp", + "runtime/core/evalue.cpp", + "runtime/core/exec_aten/util/tensor_util_portable.cpp", + "runtime/core/portable_type/tensor_impl.cpp", + "runtime/executor/method.cpp", + "runtime/executor/method_meta.cpp", + "runtime/executor/program.cpp", + "runtime/executor/tensor_parser_exec_aten.cpp", + "runtime/executor/tensor_parser_portable.cpp", + "runtime/kernel/operator_registry.cpp", + "runtime/platform/abort.cpp", + "runtime/platform/default/posix.cpp", + "runtime/platform/log.cpp", + "runtime/platform/profiler.cpp", + "runtime/platform/runtime.cpp", + + "schema/extended_header.cpp", + ], + include_dirs: ["external/pytorch"], + header_libs: [ + "flatbuffer_headers", + ], + generated_headers: [ + "executorch_fbs_gen", + "executorch_portable_ops_gen_h", + ], + cflags: [ + "-Wno-unused-parameter", + ], + host_supported: true, +} + +cc_binary { + name: "executor_runner", + srcs: ["examples/portable/executor_runner/executor_runner.cpp"], + static_libs: [ + "libgflags", + ], + whole_static_libs: [ + "libexecutorch", + ], + include_dirs: ["external/pytorch"], + host_supported: true, +} + +cc_genrule { + name: "executorch_fbs_gen", + host_supported: true, + tools: ["flatc"], + srcs: [ + "schema/program.fbs", + "schema/scalar_type.fbs" + ], + out: [ + "executorch/schema/program_generated.h", + "executorch/schema/scalar_type_generated.h" + ], + cmd: "$(location flatc) --cpp --cpp-std c++11 --gen-mutable --scoped-enums --include-prefix executorch/schema/ " + + "-o $(genDir)/executorch/schema/ $(in)" +} + +filegroup { + name: "executorch_codegen_templates", + srcs: [ + "codegen/templates/*.cpp", + "codegen/templates/*.h", + "codegen/templates/*.ini", + ], +} + +filegroup { + name: "executorch_portable_yaml", + srcs: [ + "kernels/portable/functions.yaml", + ], +} + +cc_genrule { + name: "executorch_portable_ops_gen_cpp", + host_supported: true, + tools: ["torchgen_executorch"], + tool_files: [ + ":executorch_codegen_templates", + ":executorch_portable_yaml", + ":torchgen_native_functions", + ":torchgen_tags", + ], + out: [ + "RegisterCodegenUnboxedKernels_0.cpp", + ], + cmd: "mkdir templates; cp $(locations :executorch_codegen_templates) templates;" + + "$(location torchgen_executorch) " + + "--source-path=. 
" + + "--tags-path=$(location :torchgen_tags) " + + "--aten_yaml_path=$(location :torchgen_native_functions) " + + "--functions_yaml_path=$(location :executorch_portable_yaml) " + + "--install_dir=$(genDir)/" +} + +cc_genrule { + name: "executorch_portable_ops_gen_h", + host_supported: true, + tools: ["torchgen_executorch"], + tool_files: [ + ":executorch_codegen_templates", + ":executorch_portable_yaml", + ":torchgen_native_functions", + ":torchgen_tags", + ], + out: [ + "NativeFunctions.h", + ], + cmd: "mkdir templates; cp $(locations :executorch_codegen_templates) templates;" + + "$(location torchgen_executorch) " + + "--source-path=. " + + "--tags-path=$(location :torchgen_tags) " + + "--aten_yaml_path=$(location :torchgen_native_functions) " + + "--functions_yaml_path=$(location :executorch_portable_yaml) " + + "--install_dir=$(genDir)/" +} diff --git a/torchgen/Android.bp b/torchgen/Android.bp new file mode 100644 index 00000000000..f39b59acda9 --- /dev/null +++ b/torchgen/Android.bp @@ -0,0 +1,28 @@ +python_library_host { + name: "torchgen_library", + srcs: [ + "**/*.py" + ], + pkg_path: "torchgen", + libs: [ + "pyyaml", + "typing_extensions", + ], +} + +python_binary_host { + name: "torchgen_executorch", + main: "gen_executorch.py", + srcs: ["gen_executorch.py"], + libs: ["torchgen_library"], +} + +filegroup { + name: "torchgen_native_functions", + srcs: ["native_functions.yaml"], +} + +filegroup { + name: "torchgen_tags", + srcs: ["tags.yaml"], +} diff --git a/torchgen/BUCK.oss b/torchgen/BUCK.oss new file mode 100644 index 00000000000..50774c38f7e --- /dev/null +++ b/torchgen/BUCK.oss @@ -0,0 +1,23 @@ +python_library( + name = "torchgen", + srcs = glob( + ["**/*.py"], + ), + base_module = "torchgen", + visibility = ["PUBLIC"], + deps = [ + "//third_party:pyyaml", + "//third_party:typing-extensions", + ], +) + +python_binary( + name = "gen", + main_module = "torchgen.gen", + visibility = [ + "PUBLIC", + ], + deps = [ + ":torchgen", + ], +) diff --git a/torchgen/BUILD.bazel b/torchgen/BUILD.bazel new file mode 100644 index 00000000000..d1a0db360d2 --- /dev/null +++ b/torchgen/BUILD.bazel @@ -0,0 +1,4 @@ +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) diff --git a/torchgen/__init__.py b/torchgen/__init__.py new file mode 100644 index 00000000000..2d5dbf0667a --- /dev/null +++ b/torchgen/__init__.py @@ -0,0 +1,10 @@ +"""torchgen + +This module contains codegeneration utilities for PyTorch. It is used to +build PyTorch from source, but may also be used for out-of-tree projects +that extend PyTorch. + +Note well that we provide no BC guarantees for torchgen. If you're interested +in using torchgen and want the PyTorch team to be aware, please reach out +on GitHub. +""" diff --git a/torchgen/_autoheuristic/README.md b/torchgen/_autoheuristic/README.md new file mode 100644 index 00000000000..58613e54fb8 --- /dev/null +++ b/torchgen/_autoheuristic/README.md @@ -0,0 +1,130 @@ +# AutoHeuristic +AutoHeuristic is a framework that allows one to use results from autotuning to learn a heuristic as a decision tree, that can be generated to code and shipped with compiler. + +## How to use AutoHeuristic +In general, the following steps have to performed: +- The AutoHeursitic constructor has to be called. +- A script that runs benchmarks in order to collect training data has to be implemented. 
+- The train_decision.py (if you want to learn a decision tree) or train_regression.py (if you want to learn a regression tree) script has to be run in order to learn the heuristic and generate it to code. + +## Step 1: Calling the AutoHeuristic constructor +Currently, two use cases are supported: + +### Use case 1: Local autotuning +When your feedback function is able to immediately return a result, you can just call the AutoHeuristic constructor. This is done e.g. for pad_mm +``` +autoheuristic = AutoHeuristic( + fallback=fallback, + choices=choices, + feedback=feedback, + context=context, + name=name, + augment_context=pad_mm_operations(), + precondition=pad_mm_precondition, +) +``` +Here, `feedback` is a function that benchmarks a given choice and returns the execution time. For an example, see: https://github.com/pytorch/pytorch/blob/main/torch/_inductor/fx_passes/pad_mm.py. + +### Use case 2: Kernel choice selection +If you want to use AutoHeuristic for kernel choice selection, you have to call the AutoHeuristicSelectAlgorithm constructor. This is done e.g. for mixed_mm +``` +autoheuristic = AutoHeuristicSelectAlgorithm( + fallback=fallback, + choices=choices, + input_nodes=input_nodes, + context=context, + name=name, + augment_context=ops, + precondition=precondition, +) +``` +This call has to be followed by a call to `autotune_select_algorithm()`, +``` +autotune_select_algorithm(name, choices, input_nodes, layout) +``` +Note that `choices`, `input_nodes`, and `name` in the `AutoHeuristicSelectAlgorithm()` and `autotune_select_algorithm()` calls have to match when you want to use AutoHeuristic to collect data. + +For an example, see: https://github.com/pytorch/pytorch/blob/main/torch/_inductor/kernel/mm.py + +## Step 2: Collecting training data +After adding the call to the AutoHeuristic constructor, you need to collect training data in order to learn a heuristic. Let's say you have a script `run.py` that triggers the AutoHeuristic constructor that you just added. Run the following command in order to store data into file `train.txt`: +``` +TORCHINDUCTOR_AUTOHEURISTIC_LOG_PATH="train.txt" \ + TORCHINDUCTOR_AUTOHEURISTIC_COLLECT="pad_mm" python run.py +``` +Replace "pad_mm" with the name you provided in the call to the AutoHeuristic constructor. + +AutoHeuristic provides a `BenchmarkRunner` class (https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/benchmark_runner.py) that simplifies the process of collecting data. To use it, create a new class that subclasses `BenchmarkRunner`, and implements the `run_benchmark()` and `create_input()` methods. + +These examples might be helpful: +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py + + +## Step 3: Learning a heuristic and using it +Once you have collected enough training data, you are ready to learn a heuristic: +``` +python torchgen/_autoheuristic/train_decision.py train.txt --heuristic-name SimpleHeuristic +``` +will learn a heuristic and generate it to `torch/_inductor/autoheuristic/artifacts/_SimpleHeuristic.py`. + +You can now use your learned heuristic: +``` +TORCHINDUCTOR_AUTOHEURISTIC_USE="pad_mm" python run.py +``` +Here, you again have to replace "pad_mm" with the name you provided in the call to the AutoHeuristic constructor. + +Instead of just running the `train_decision.py` script, you probably want to customize the training process in some way. 
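For illustration, a minimal customized trainer might look roughly like the following sketch (the class name, the `m*k` feature, and the fallback string are hypothetical; the structure mirrors `train_decision_mixedmm.py`, which subclasses the `AHTrainDecisionTree` base class from `train_decision.py` in this PR):
```
# hypothetical example: customize training for an operator called "my_op"
from train_decision import AHTrainDecisionTree


class AHTrainDecisionTreeMyOp(AHTrainDecisionTree):
    def add_new_features(self, results):
        # add an augmented feature (m*k) as an extra column before training
        results["m*k"] = results.apply(lambda row: row["m"] * row["k"], axis=1)
        # no categorical features were added
        return (results, [])

    def get_default_config(self, row):
        # choice the heuristic should fall back to when it is unsure
        return "my_fallback_choice"


if __name__ == "__main__":
    AHTrainDecisionTreeMyOp().generate_heuristic()
```
The mixed_mm trainer later in this diff, for example, also overrides `is_unsafe_leaf()` and `get_grid_search_values()`.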
To do this, create a new class that subclasses `AHTrainDecision` and override the methods you want to customize, as in the sketch above. Here are some examples: +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py + +## Other + +### How do I specify features that the heuristic is going to use to make a decision? +The AutoHeuristic constructor requires a `context` argument of type `AHContext`, which will contain all features. You specify features in the following way: +``` +context = AHContext() + +# adding numerical features +context.add_feature("m", mat1.shape[0]) +context.add_feature("k", mat1.shape[1]) + +# adding a categorical feature +context.add_feature("mat1_dtype", mat1.dtype, is_categorical=True) +``` + +You might want to use features that are a combination of other features, such as `m*k`. You can of course add such features in the same way as above, i.e., +``` +context.add_feature("m*k", mat1.shape[0] * mat1.shape[1]) +``` +but AutoHeuristic also provides a way to 'augment' features. Augmented features are not stored when data is collected; instead, they are created before a heuristic is learned, or before a learned heuristic is used. You can specify such augmented features by creating a list of `AHOperation` objects: +``` +def m_times_k(data: Any) -> float: + return data['m'] * data['k'] + +m_times_k_op = AHOperation("m*k", m_times_k) +ah_operations = [m_times_k_op] + +# specify augmented features by setting `augment_context` to `ah_operations` +autoheuristic = AutoHeuristic(..., augment_context=ah_operations, ...) +``` + +Note that you also have to specify these operations when you want to learn a heuristic. Look at the `add_new_features()` method in these examples to see how it is done: +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py +- https://github.com/pytorch/pytorch/blob/main/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py + +### Where has AutoHeuristic already been used? +Take a look at the following PRs in which AutoHeuristic has been enabled for various optimizations. +Looking at these examples may be helpful if you want to use AutoHeuristic yourself.
+- pad_mm: https://github.com/pytorch/pytorch/pull/128643 +- mixed_mm: + - Enabling of AutoHeuristic: https://github.com/pytorch/pytorch/pull/131610 + - Script to collect data: https://github.com/pytorch/pytorch/pull/131611 + - A100 heuristic: https://github.com/pytorch/pytorch/pull/131613 + - H100 heuristic: https://github.com/pytorch/pytorch/pull/132685 +- flex_attention: https://github.com/pytorch/pytorch/pull/130398 +- mm (heuristic for ranking choices): + - https://github.com/pytorch/pytorch/pull/131615 + - https://github.com/pytorch/pytorch/pull/131617 + - https://github.com/pytorch/pytorch/pull/131705 + - https://github.com/pytorch/pytorch/pull/131714 diff --git a/torchgen/_autoheuristic/ah_tree.py b/torchgen/_autoheuristic/ah_tree.py new file mode 100644 index 00000000000..3991ffc87f8 --- /dev/null +++ b/torchgen/_autoheuristic/ah_tree.py @@ -0,0 +1,262 @@ +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +from sklearn.tree import _tree # type: ignore[import-untyped] + + +class DecisionTreeNode: + def __init__( + self, + feature: Optional[str] = None, + threshold: Optional[float] = None, + left: Optional["DecisionTreeNode"] = None, + right: Optional["DecisionTreeNode"] = None, + class_probs: Any = None, + num_samples: int = 0, + node_id: int = 0, + ) -> None: + self.feature = feature + self.threshold = threshold + self.left = left + self.right = right + self.class_probs = class_probs + self.num_samples = num_samples + self.id = node_id + + def is_leaf(self) -> bool: + return self.left is None or self.right is None + + +class DecisionTree: + """ + Custom decision tree implementation that mimics some of the sklearn API. + The purpose of this class it to be able to perform transformations, such as custom pruning, which + does not seem to be easy with sklearn. 
+ """ + + def __init__(self, sklearn_tree: Any, feature_names: List[str]) -> None: + self.feature_names = feature_names + self.root = self._convert_sklearn_tree(sklearn_tree.tree_) + self.classes_: List[str] = sklearn_tree.classes_ + + def _convert_sklearn_tree( + self, sklearn_tree: Any, node_id: int = 0 + ) -> DecisionTreeNode: + class_probs = sklearn_tree.value[node_id][0] + num_samples = sklearn_tree.n_node_samples[node_id] + if sklearn_tree.feature[node_id] != _tree.TREE_UNDEFINED: + feature_index = sklearn_tree.feature[node_id] + feature = self.feature_names[feature_index] + left = self._convert_sklearn_tree( + sklearn_tree, sklearn_tree.children_left[node_id] + ) + right = self._convert_sklearn_tree( + sklearn_tree, sklearn_tree.children_right[node_id] + ) + return DecisionTreeNode( + feature=feature, + threshold=sklearn_tree.threshold[node_id], + left=left, + right=right, + class_probs=class_probs, + num_samples=num_samples, + node_id=node_id, + ) + else: + return DecisionTreeNode( + class_probs=class_probs, num_samples=num_samples, node_id=node_id + ) + + def prune(self, df: Any, target_col: str, k: int) -> None: + self.root = self._prune_tree(self.root, df, target_col, k) + + def _prune_tree( + self, node: DecisionTreeNode, df: Any, target_col: str, k: int + ) -> DecisionTreeNode: + if node.is_leaf(): + return node + + left_df = df[df[node.feature] <= node.threshold] + right_df = df[df[node.feature] > node.threshold] + + # number of unique classes in the left and right subtrees + left_counts = left_df[target_col].nunique() + right_counts = right_df[target_col].nunique() + + # for ranking, we want to ensure that we return at least k classes, so if we have less than k classes in the + # left or right subtree, we remove the split and make this node a leaf node + if left_counts < k or right_counts < k: + return DecisionTreeNode(class_probs=node.class_probs) + + assert node.left is not None, "expected left child to exist" + node.left = self._prune_tree(node.left, left_df, target_col, k) + assert node.right is not None, "expected right child to exist" + node.right = self._prune_tree(node.right, right_df, target_col, k) + + return node + + def to_dot(self) -> str: + dot = "digraph DecisionTree {\n" + dot += ' node [fontname="helvetica"];\n' + dot += ' edge [fontname="helvetica"];\n' + dot += self._node_to_dot(self.root) + dot += "}" + return dot + + def _node_to_dot( + self, node: DecisionTreeNode, parent_id: int = 0, edge_label: str = "" + ) -> str: + if node is None: + return "" + + node_id = id(node) + + # Format class_probs array with line breaks + class_probs_str = self._format_class_probs_array( + node.class_probs, node.num_samples + ) + + if node.is_leaf(): + label = class_probs_str + shape = "box" + else: + feature_name = f"{node.feature}" + label = f"{feature_name} <= {node.threshold:.2f}\\n{class_probs_str}" + shape = "oval" + + dot = f' {node_id} [label="{label}", shape={shape}];\n' + + if parent_id != 0: + dot += f' {parent_id} -> {node_id} [label="{edge_label}"];\n' + + if not node.is_leaf(): + assert node.left is not None, "expected left child to exist" + dot += self._node_to_dot(node.left, node_id, "<=") + assert node.right is not None, "expected right child to exist" + dot += self._node_to_dot(node.right, node_id, ">") + + return dot + + def _format_class_prob(self, num: float) -> str: + if num == 0: + return "0" + return f"{num:.2f}" + + def _format_class_probs_array( + self, class_probs: Any, num_samples: int, max_per_line: int = 5 + ) -> str: + # add line breaks to 
avoid very long lines + flat_class_probs = class_probs.flatten() + formatted = [self._format_class_prob(v) for v in flat_class_probs] + lines = [ + formatted[i : i + max_per_line] + for i in range(0, len(formatted), max_per_line) + ] + return f"num_samples={num_samples}\\n" + "\\n".join( + [", ".join(line) for line in lines] + ) + + def predict(self, X: Any) -> Any: + predictions = [self._predict_single(x) for _, x in X.iterrows()] + return np.array(predictions) + + def predict_proba(self, X: Any) -> Any: + return np.array([self._predict_proba_single(x) for _, x in X.iterrows()]) + + def _get_leaf(self, X: Any) -> DecisionTreeNode: + node = self.root + while not node.is_leaf(): + if X[node.feature] <= node.threshold: + assert node.left is not None, "expected left child to exist" + node = node.left + else: + assert node.right is not None, "expected right child to exist" + node = node.right + return node + + def _predict_single(self, x: Any) -> str: + node = self._get_leaf(x) + # map index to class name + return self.classes_[np.argmax(node.class_probs)] + + def _predict_proba_single(self, x: Any) -> Any: + node = self._get_leaf(x) + return node.class_probs + + def apply(self, X: Any) -> Any: + ids = [self._apply_single(x) for _, x in X.iterrows()] + return np.array(ids) + + def _apply_single(self, x: Any) -> int: + node = self._get_leaf(x) + return node.id + + def codegen( + self, + dummy_col_2_col_val: Dict[str, Tuple[str, Any]], + lines: List[str], + unsafe_leaves: List[int], + ) -> None: + # generates python code for the decision tree + def codegen_node(node: DecisionTreeNode, depth: int) -> None: + indent = " " * (depth + 1) + if node.is_leaf(): + lines.append(handle_leaf(node, indent, unsafe_leaves)) + else: + name = node.feature + threshold = node.threshold + if name in dummy_col_2_col_val: + (orig_name, value) = dummy_col_2_col_val[name] + predicate = f"{indent}if str(context.get_value('{orig_name}')) != '{value}':" + assert ( + threshold == 0.5 + ), f"expected threshold to be 0.5 but is {threshold}" + else: + predicate = ( + f"{indent}if context.get_value('{name}') <= {threshold}:" + ) + lines.append(predicate) + assert node.left is not None, "expected left child to exist" + codegen_node(node.left, depth + 1) + lines.append(f"{indent}else:") + assert node.right is not None, "expected right child to exist" + codegen_node(node.right, depth + 1) + + def handle_leaf( + node: DecisionTreeNode, indent: str, unsafe_leaves: List[int] + ) -> str: + """ + This generates the code for a leaf node in the decision tree. If the leaf is unsafe, the learned heuristic + will return "unsure" (i.e. None). + """ + if node.id in unsafe_leaves: + return f"{indent}return None" + class_probas = node.class_probs + return f"{indent}return {best_probas_and_indices(class_probas)}" + + def best_probas_and_indices(class_probas: Any) -> str: + """ + Given a list of tuples (proba, idx), this function returns a string in which the tuples are + sorted by proba in descending order. 
E.g.: + Given class_probas=[(0.3, 0), (0.5, 1), (0.2, 2)] + this function returns + "[(0.5, 1), (0.3, 0), (0.2, 2)]" + """ + # we generate a list of tuples (proba, idx) sorted by proba in descending order + # idx is the index of a choice + # we only generate a tuple if proba > 0 + probas_indices_sorted = sorted( + [ + (proba, index) + for index, proba in enumerate(class_probas) + if proba > 0 + ], + key=lambda x: x[0], + reverse=True, + ) + probas_indices_sorted_str = ", ".join( + f"({value:.3f}, {index})" for value, index in probas_indices_sorted + ) + return f"[{probas_indices_sorted_str}]" + + codegen_node(self.root, 1) diff --git a/torchgen/_autoheuristic/benchmark_runner.py b/torchgen/_autoheuristic/benchmark_runner.py new file mode 100644 index 00000000000..999ea48cbe1 --- /dev/null +++ b/torchgen/_autoheuristic/benchmark_runner.py @@ -0,0 +1,82 @@ +import argparse +import random +import time +from abc import abstractmethod +from typing import Any, Tuple + +from tqdm import tqdm # type: ignore[import-untyped] + +import torch + + +class BenchmarkRunner: + """ + BenchmarkRunner is a base class for all benchmark runners. It provides an interface to run benchmarks in order to + collect data with AutoHeuristic. + """ + + def __init__(self, name: str) -> None: + self.name = name + self.parser = argparse.ArgumentParser() + self.add_base_arguments() + self.args = None + + def add_base_arguments(self) -> None: + self.parser.add_argument( + "--device", + type=int, + default=None, + help="torch.cuda.set_device(device) will be used", + ) + self.parser.add_argument( + "--use-heuristic", + action="store_true", + help="Use learned heuristic instead of collecting data.", + ) + self.parser.add_argument( + "-o", + type=str, + default="ah_data.txt", + help="Path to file where AutoHeuristic will log results.", + ) + self.parser.add_argument( + "--num-samples", + type=int, + default=1000, + help="Number of samples to collect.", + ) + self.parser.add_argument( + "--num-reps", + type=int, + default=3, + help="Number of measurements to collect for each input.", + ) + + def run(self) -> None: + torch.set_default_device("cuda") + args = self.parser.parse_args() + if args.use_heuristic: + torch._inductor.config.autoheuristic_use = self.name + torch._inductor.config.autoheuristic_collect = "" + else: + torch._inductor.config.autoheuristic_use = "" + torch._inductor.config.autoheuristic_collect = self.name + torch._inductor.config.autoheuristic_log_path = args.o + if args.device is not None: + torch.cuda.set_device(args.device) + random.seed(time.time()) + self.main(args.num_samples, args.num_reps) + + @abstractmethod + def run_benchmark(self, *args: Any) -> None: + ... + + @abstractmethod + def create_input(self) -> Tuple[Any, ...]: + ... 
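    # Subclasses implement create_input() to draw one random input configuration and
    # run_benchmark() to compile and time it; main() below calls create_input() once per
    # sample and run_benchmark() num_reps times on that input (see gen_data_mixed_mm.py
    # and gen_data_mm.py later in this diff for concrete implementations).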
+ + def main(self, num_samples: int, num_reps: int) -> None: + for _ in tqdm(range(num_samples)): + input = self.create_input() + for _ in range(num_reps): + self.run_benchmark(*input) diff --git a/torchgen/_autoheuristic/benchmark_utils.py b/torchgen/_autoheuristic/benchmark_utils.py new file mode 100644 index 00000000000..b3e19c101d2 --- /dev/null +++ b/torchgen/_autoheuristic/benchmark_utils.py @@ -0,0 +1,62 @@ +import random +from typing import Any, Tuple + +import torch + + +def transpose_tensors(p_transpose_both: float = 0.05) -> Tuple[bool, bool]: + transpose_both = random.choices( + [True, False], [p_transpose_both, 1 - p_transpose_both] + )[0] + if transpose_both: + return (True, True) + transpose_left = (True, False) + transpose_right = (False, True) + no_transpose = (False, False) + return random.choices([transpose_left, transpose_right, no_transpose])[0] + + +def fits_in_memory(dtype: Any, m: int, k: int, n: int) -> Any: + threshold_memory = torch.cuda.get_device_properties(0).total_memory / 4 + # dividing by 4 because we otherwise sometimes run out of memory, I assume because + # inductor creates copies of tensors for benchmarking? + return dtype.itemsize * (m * k + k * n + m * n) < threshold_memory + + +def get_mm_tensors( + m: int, + k: int, + n: int, + transpose_left: bool, + transpose_right: bool, + dtype_left: Any, + dtype_right: Any, +) -> Tuple[Any, Any]: + if transpose_left: + a = torch.randn(k, m, dtype=dtype_left).t() + else: + a = torch.randn(m, k, dtype=dtype_left) + + if transpose_right: + b = torch.randn(n, k, dtype=dtype_right).t() + else: + b = torch.randn(k, n, dtype=dtype_right) + return (a, b) + + +def set_precision(dtype: Any, p_float32_prec_highest: float = 0.8) -> None: + if dtype == torch.float32: + precisions = ["high", "highest"] + weights = [1 - p_float32_prec_highest, p_float32_prec_highest] + precision = random.choices(precisions, weights)[0] + else: + precision = "high" + torch.set_float32_matmul_precision(precision) + + +def get_random_between_pow2(min_power2: int, max_power2: int) -> int: + i = random.randint(min_power2, max_power2 - 1) + lower = 2**i + 1 + upper = 2 ** (i + 1) - 1 + assert lower <= upper, "lower must not be greater than upper" + return random.randint(lower, upper) diff --git a/torchgen/_autoheuristic/collect_data.sh b/torchgen/_autoheuristic/collect_data.sh new file mode 100644 index 00000000000..442f6120327 --- /dev/null +++ b/torchgen/_autoheuristic/collect_data.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# this script makes it easy to parallelize data collection across multiple GPUs + +# Check if tmux is installed +if ! command -v tmux &> /dev/null; then + echo "tmux is not installed. Please install it and try again."
+ exit 1 +fi + +# Check if the correct number of arguments is provided +if [ "$#" -ne 5 ]; then + echo "Usage: $0 \"<python command>\" <device numbers> <num samples> <conda env> <output dir>" + echo "Example: $0 \"python run.py --a b --b c\" 1,4,5,3 1000 pytorch-3.10 a100" + exit 1 +fi + +PYTHON_COMMAND=$1 +DEVICE_NUMBERS=$2 +NUM_SAMPLES=$3 +CONDA_ENV=$4 +OUTPUT_DIR=$5 + +# Create a new tmux session +SESSION_NAME="parallel_run_$(date +%s)" +tmux new-session -d -s "$SESSION_NAME" + +# Split the device numbers +IFS=',' read -ra DEVICES <<< "$DEVICE_NUMBERS" + +NUM_GPUS=${#DEVICES[@]} +NUM_SAMPLES_PER_GPU=$((NUM_SAMPLES / NUM_GPUS)) +echo "AutoHeuristic will collect ${NUM_SAMPLES} samples split across ${NUM_GPUS} GPUs" +echo "Each GPU will collect ${NUM_SAMPLES_PER_GPU} samples" + +# Function to create a new pane and run the script +create_pane() { + local device=$1 + tmux split-window -t "$SESSION_NAME" + tmux send-keys -t "$SESSION_NAME" "conda activate ${CONDA_ENV} && $PYTHON_COMMAND --device $device -o ${OUTPUT_DIR}/data_${device}.txt --num-samples ${NUM_SAMPLES_PER_GPU}" C-m +} + +# Create panes for each device number +for device in "${DEVICES[@]}"; do + create_pane ${device} +done + +# Remove the first pane (empty one) +tmux kill-pane -t "$SESSION_NAME.0" + +# Arrange panes in a tiled layout +tmux select-layout -t "$SESSION_NAME" tiled + +# Attach to the tmux session +tmux attach-session -t "$SESSION_NAME" diff --git a/torchgen/_autoheuristic/generate_heuristic.sh b/torchgen/_autoheuristic/generate_heuristic.sh new file mode 100644 index 00000000000..97696a43712 --- /dev/null +++ b/torchgen/_autoheuristic/generate_heuristic.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# -lt 8 ]; then + echo "Error: This script requires at least 8 arguments." + exit 1 +fi + +MODE=$1 +GPU_DEVICE_IDS=$2 +CONDA_ENV=$3 +NUM_SAMPLES=$4 +OUTPUT_DIR=$5 +HEURISTIC_NAME=$6 +BENCHMARK_SCRIPT=$7 +TRAIN_SCRIPT=$8 +EXTRA_TRAIN_ARGS=$9 + +mkdir -p ${OUTPUT_DIR} + +if [ "$MODE" = "collect" ]; then + # this will collect data for NUM_SAMPLES samples on the number of GPUs specified in GPU_DEVICE_IDS in parallel + bash ../collect_data.sh "python ${BENCHMARK_SCRIPT}" ${GPU_DEVICE_IDS} ${NUM_SAMPLES} ${CONDA_ENV} ${OUTPUT_DIR} +elif [ "$MODE" = "generate" ]; then + # the bash script above generates one separate txt file per GPU + # if GPU_DEVICE_IDS=6,7, it will generate "data_6.txt", "data_7.txt" inside OUTPUT_DIR + # these files have to be merged into a single file before we can use AutoHeuristic to learn a heuristic + OUTPUT_FILE="${OUTPUT_DIR}/${HEURISTIC_NAME}.txt" + INPUT_FILES=$(echo $GPU_DEVICE_IDS | tr ',' '\n' | sed "s|^|${OUTPUT_DIR}/data_|" | sed 's/$/.txt/') + python ../merge_data.py ${OUTPUT_FILE} ${INPUT_FILES} + + # This will learn a heuristic and generate the code into torch/_inductor/autoheuristic/artifacts/_${HEURISTIC_NAME}.py + python ${TRAIN_SCRIPT} ${OUTPUT_FILE} --heuristic-name ${HEURISTIC_NAME} ${EXTRA_TRAIN_ARGS} +else + echo "Error: Invalid mode ${MODE}. Please use 'collect' or 'generate'."
+ exit 1 +fi diff --git a/torchgen/_autoheuristic/merge_data.py b/torchgen/_autoheuristic/merge_data.py new file mode 100644 index 00000000000..374e77d6ed1 --- /dev/null +++ b/torchgen/_autoheuristic/merge_data.py @@ -0,0 +1,60 @@ +import sys +from typing import List + + +def merge_txt_files(file_list: List[str], output_file: str) -> None: + if not file_list: + print("No input files provided.") + return + + metadata: List[str] = [] + content: List[str] = [] + + # Read metadata and content from all files + for file_path in file_list: + try: + with open(file_path) as file: + lines = file.readlines() + if len(lines) < 2: + print( + f"Error: {file_path} does not have enough lines for metadata." + ) + return + + file_metadata = lines[:2] + file_content = lines[2:] + + if not metadata: + metadata = file_metadata + elif metadata != file_metadata: + print(f"Error: Metadata mismatch in {file_path}") + print("Expected metadata:") + print("".join(metadata)) + print(f"Metadata in {file_path}:") + print("".join(file_metadata)) + return + + content.extend(file_content) + except OSError as e: + print(f"Error reading file {file_path}: {e}") + return + + # Write merged content to output file + try: + with open(output_file, "w") as outfile: + outfile.writelines(metadata) + outfile.writelines(content) + print(f"Successfully merged files into {output_file}") + except OSError as e: + print(f"Error writing to output file {output_file}: {e}") + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Usage: python script.py output_file.txt input_file1.txt input_file2.txt ..." + ) + else: + output_file = sys.argv[1] + input_files = sys.argv[2:] + merge_txt_files(input_files, output_file) diff --git a/torchgen/_autoheuristic/mixed_mm/README.md b/torchgen/_autoheuristic/mixed_mm/README.md new file mode 100644 index 00000000000..b77cf659109 --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/README.md @@ -0,0 +1,16 @@ +If you just want to re-generate existing heuristics with already collected data for mixed_mm for A100/H100, run the following scripts: + +`bash get_mixedmm_dataset.sh # Downloads A100 and H100 datasets` +`bash gen_mixedmm_heuristic_a100.sh # Generates A100 heuristic` +`bash gen_mixedmm_heuristic_h100.sh # Generates H100 heuristic` + +If you want to collect new data, or generate a heuristic for another GPU, use the `generate_heuristic.sh` script: +First, go into the generate_heuristic.sh and modify the variables according to the comments. +Then run the script to perform benchmarks and collect training data: + +`bash generate_heuristic.sh collect` + +Depending on how many GPUs you are using, this might take a day. 
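The collect step launches one process per GPU, so it leaves one data file per device in the output directory (with the defaults in `generate_heuristic_mixedmm.sh`, i.e. GPUs 4 and 5 and `OUTPUT_DIR="a100"`, that would be `a100/data_4.txt` and `a100/data_5.txt`). The generate step merges these files automatically via `merge_data.py`; if you ever need to merge them by hand, the call would look roughly like this (file names assumed from those defaults, run from this directory):

`python ../merge_data.py a100/MixedMMA100.txt a100/data_4.txt a100/data_5.txt`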
+Afterwards, run the script in order to learn the heuristic: + +`bash generate_heuristic.sh generate` diff --git a/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py b/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py new file mode 100644 index 00000000000..d9b7166630e --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py @@ -0,0 +1,146 @@ +# mypy: ignore-errors +import os +import random +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from typing import Any, Tuple + +from benchmark_runner import BenchmarkRunner # type: ignore[import-not-found] +from benchmark_utils import ( # type: ignore[import-not-found] + fits_in_memory, + get_mm_tensors, + get_random_between_pow2, +) + +import torch +from torch._inductor.utils import fresh_inductor_cache + + +class BenchmarkRunnerMixedMM(BenchmarkRunner): # type: ignore[misc, no-any-unimported] + """ + BenchmarkRunner for mixed mm. Used to generate collect training data with AutoHeuristic to learn a heuristic. + Currently, we are generating inputs with the following restrictions: + - m <= 128, and n and k >= 1024 (for these inputs one of the triton kernels wins in most cases) + - k % 256 == 0 (if k is not a multiple of the block size, this can have a huge negative impact on performance) + - mat1 not transposed + - mat2 transposed + This allows us to learn a heuristic that works well e.g. for gpt-fast. + """ + + def __init__(self) -> None: + super().__init__("mixed_mm") + + def create_input(self) -> Tuple[Any, ...]: + dtype1, dtype2 = self.get_dtypes() + m, k, n = self.get_m_k_n(dtype1) + transpose_left, transpose_right = False, True + return (m, k, n, transpose_left, transpose_right, dtype1, dtype2) + + def run_benchmark( + self, + m: int, + k: int, + n: int, + transpose_left: bool, + transpose_right: bool, + dtype_left: Any, + dtype_right: Any, + ) -> Any: + a, b = get_mm_tensors( + m, + k, + n, + transpose_left, + transpose_right, + dtype_left=dtype_left, + dtype_right=torch.float32, + ) + b = b.to(dtype=dtype_right) + + with fresh_inductor_cache(): + + def mixed_mm(A, B): + return torch.mm(A, B.to(A.dtype)) + + cf = torch.compile(mixed_mm, mode="max-autotune-no-cudagraphs") + cf(a, b) + torch.compiler.reset() + + def random_multiple_of_128(self, min_num=7, max_num=17): + ran_pow2 = random.randint(min_num, max_num - 1) + start = (2**ran_pow2) // 128 + end = (2 ** (ran_pow2 + 1)) // 128 + random_multiple = random.randint(start, end) + return random_multiple * 128 + + def get_random_pow2(self, min_power2: int, max_power2: int): + return 2 ** random.randint(min_power2, max_power2) + + def get_distr_type(self) -> str: + # 85%: choose a random multiple of 128 between 2^10 and 2^17 + # 10%: choose a random power of 2 between 2^10 and 2^17 favoring larger values + # 4%: choose a random number between 1024 and 131072 + # 1%: choose a random number between 2^i and 2^(i+1) with i in [10, 16] + return random.choices( + ["mult_128", "pow2", "uniform", "uniform-between-pow2"], + [0.85, 0.1, 0.04, 0.01], + )[0] + + def get_random_dim(self): + distr_type = self.get_distr_type() + if distr_type == "mult_128": + return self.random_multiple_of_128(min_num=10, max_num=17) + if distr_type == "pow2": + return self.get_random_pow2(min_power2=10, max_power2=17) + elif distr_type == "uniform-between-pow2": + return get_random_between_pow2(min_power2=10, max_power2=17) + elif distr_type == "uniform": + return random.randint(1024, 131072) + print(f"random_type {distr_type} not supported") + 
sys.exit(1) + + def get_random_num_small(self) -> int: + pow2 = random.choices([True, False], [0.75, 0.25])[0] + if pow2: + return 2 ** random.randint(1, 7) + else: + return get_random_between_pow2(1, 7) + + def get_m_k_n(self, dtype: Any) -> Tuple[int, int, int]: + numel_max = 2**31 + + # repeat until tensors fit in memory + while True: + m = self.get_random_num_small() + k = self.get_random_dim() + n = self.get_random_dim() + if k % 256 != 0: + continue + + assert k >= 1024 and n >= 1024, "k and n must be at least 1024" + + if m * k >= numel_max or m * n >= numel_max or k * n >= numel_max: + # autotuning will not happen for tensors that are this large + continue + + if fits_in_memory(dtype, m, k, n): + return (m, k, n) + + def get_dtypes(self) -> Any: + while True: + dtype_floats = [torch.float16, torch.bfloat16] + dtype_ints = [torch.int8, torch.uint8] + mat1_dtype = random.choices(dtype_floats)[0] + mat2_dtype = random.choices(dtype_ints)[0] + if mat1_dtype == torch.bfloat16 and mat2_dtype == torch.uint8: + # this combination seems to cause issues with mixed_mm + continue + return (mat1_dtype, mat2_dtype) + + +if __name__ == "__main__": + runner = BenchmarkRunnerMixedMM() + runner.run() diff --git a/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_a100.sh b/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_a100.sh new file mode 100644 index 00000000000..158a356354a --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_a100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +data="mixedmm_a100_data.txt" + +python train_decision_mixedmm.py ${data} --heuristic-name MixedMMA100 diff --git a/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_h100.sh b/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_h100.sh new file mode 100644 index 00000000000..462f1836605 --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/gen_mixedmm_heuristic_h100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +data="mixedmm_h100_data.txt" + +python train_decision_mixedmm.py ${data} --heuristic-name MixedMMH100 diff --git a/torchgen/_autoheuristic/mixed_mm/generate_heuristic_mixedmm.sh b/torchgen/_autoheuristic/mixed_mm/generate_heuristic_mixedmm.sh new file mode 100644 index 00000000000..dd6ac78e9df --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/generate_heuristic_mixedmm.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Error: This script requires exactly one argument." + echo "`bash generate_heuristic_mixedmm.sh collect` to run benchmark and collect training data." + echo "`bash generate_heuristic_mixedmm.sh generate` to use the collected data to learn a heuristic." + exit 1 +fi + +MODE=$1 + +# !!! SPECIFY THE GPUs THAT YOU WANT TO USE HERE !!! +GPU_DEVICE_IDS="4,5" + +# !!! SPECIFY THE CONDA ENVIRONEMNT THAT YOU WANT TO BE ACTIVATED HERE !!! +CONDA_ENV=heuristic-pr + +NUM_SAMPLES=2000 + +# This is where AutoHeuristic will store autotuning results +OUTPUT_DIR="a100" + +# !!! CHANGE THE NAME OF THE HEURISTIC IF YOU WANT TO LEARN A HEURISTIC FOR A GPU THAT IS NOT A100 !!! 
+HEURISTIC_NAME="MixedMMA100" + +BENCHMARK_SCRIPT="gen_data_mixed_mm.py" + +TRAIN_SCRIPT="train_decision_mixedmm.py" + +bash ../generate_heuristic.sh ${MODE} ${GPU_DEVICE_IDS} ${CONDA_ENV} ${NUM_SAMPLES} ${OUTPUT_DIR} ${HEURISTIC_NAME} ${BENCHMARK_SCRIPT} ${TRAIN_SCRIPT} diff --git a/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh b/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh new file mode 100644 index 00000000000..fd50b2e79fb --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' +a100_data='mixedmm_a100_data.zip' +h100_data='mixedmm_h100_data.zip' +datasets=("${a100_data}" "${h100_data}") +for dataset in "${datasets[@]}"; do + rm -f ${dataset} + url="${base_url}${dataset}" + wget ${url} + unzip -o ${dataset} + rm ${dataset} +done diff --git a/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py b/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py new file mode 100644 index 00000000000..205018fcd28 --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py @@ -0,0 +1,339 @@ +import os +import sys +import unittest + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from expecttest import TestCase + +from test_utils import read_file_to_string, run_bash # type: ignore[import-not-found] + + +class TestMixedMM(TestCase): + def test_mixedmm_a100(self) -> None: + run_bash("get_mixedmm_dataset.sh") + run_bash("gen_mixedmm_heuristic_a100.sh") + file_path = "../../../torch/_inductor/autoheuristic/artifacts/_MixedMMA100.py" + a100_heuristic_generated_code = read_file_to_string(file_path) + + self.assertExpectedInline( + a100_heuristic_generated_code, + """\ +# flake8: noqa: B950 +# fmt: off +# This file was generated by AutoHeuristic. Do not modify it manually! 
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/ +from typing import List, Optional, Tuple + +from torch._inductor.autoheuristic.autoheuristic_utils import ( + AHContext, + AHMetadata, + Choice, +) +from torch._inductor.autoheuristic.learnedheuristic_interface import ( + LearnedHeuristicDecision, +) + + +class MixedMMA100(LearnedHeuristicDecision): + + def __init__(self) -> None: + self.choices: List[Choice] = [] + self.fill_choices() + + def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool: + return ( + metadata.name == self.get_name() + and metadata.shared_memory == 166912 + and str(metadata.device_capa) == "(8, 0)" + ) + + def get_confidence_threshold(self) -> float: + return 0.0 + + def get_choice(self, idx: int) -> Optional[str]: + if idx < len(self.choices): + return self.choices[idx] + return None + + def fill_choices(self) -> None: + self.choices.append('extern_fallback_mixed_mm') + self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8') + + def get_name(self) -> str: + return 'mixed_mm' + + def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]: + if str(context.get_value('1LEQmLEQ16')) != 'True': + if context.get_value('m') <= 32.5: + if context.get_value('n') <= 6976.0: + if context.get_value('n') <= 3520.0: + if context.get_value('m*n') <= 37632.0: + return None + else: + return [(1.000, 13)] + else: + if context.get_value('m*k') <= 452352.0: + return [(0.590, 13), (0.256, 8), (0.103, 7), (0.051, 11)] + else: + return [(0.778, 8), (0.222, 13)] + else: + if 
context.get_value('k*n') <= 102776832.0: + if context.get_value('n') <= 14656.0: + return [(1.000, 11)] + else: + return [(0.889, 11), (0.111, 13)] + else: + return [(1.000, 11)] + else: + if context.get_value('m*n') <= 446464.0: + if context.get_value('m*n') <= 223424.0: + if context.get_value('mat1_stride_0') <= 3968.0: + return None + else: + return None + else: + if context.get_value('m*n') <= 346112.0: + return [(0.960, 16), (0.040, 7)] + else: + return [(0.750, 16), (0.136, 14), (0.114, 7)] + else: + if str(context.get_value('33LEQmLEQ64')) != 'True': + if context.get_value('n') <= 6976.0: + return [(1.000, 14)] + else: + return [(0.753, 2), (0.222, 1), (0.015, 7), (0.007, 16), (0.004, 12)] + else: + if context.get_value('n') <= 13888.0: + return [(0.710, 14), (0.275, 21), (0.014, 12)] + else: + return [(0.374, 19), (0.339, 20), (0.106, 21), (0.101, 16), (0.066, 17), (0.009, 14), (0.004, 18)] + else: + if context.get_value('n') <= 3520.0: + if context.get_value('arith_intensity') <= 3.994754433631897: + if str(context.get_value('mat2_dtype')) != 'torch.uint8': + if context.get_value('m*k') <= 18944.0: + return [(0.577, 5), (0.423, 6)] + else: + return [(0.988, 5), (0.012, 6)] + else: + if context.get_value('arith_intensity') <= 2.9899919033050537: + return None + else: + return None + else: + if context.get_value('arith_intensity') <= 7.956453561782837: + if context.get_value('k*n') <= 9244032.0: + return [(0.822, 5), (0.178, 6)] + else: + return [(0.977, 5), (0.023, 0)] + else: + if context.get_value('m*k') <= 978944.0: + return [(1.000, 5)] + else: + return [(0.971, 5), (0.029, 0)] + else: + if context.get_value('n') <= 13632.0: + if context.get_value('n') <= 6976.0: + return [(1.000, 6)] + else: + if context.get_value('k') <= 3968.0: + return [(0.617, 3), (0.111, 5), (0.099, 7), (0.086, 9), (0.062, 6), (0.025, 8)] + else: + return [(0.779, 8), (0.119, 5), (0.053, 7), (0.035, 6), (0.013, 3)] + else: + if context.get_value('k*n') <= 39518208.0: + return [(0.385, 4), (0.327, 3), (0.192, 6), (0.038, 7), (0.038, 10), (0.019, 5)] + else: + if context.get_value('n') <= 20800.0: + return [(0.821, 6), (0.121, 7), (0.029, 4), (0.014, 5), (0.007, 3), (0.007, 8)] + else: + return [(0.530, 7), (0.386, 6), (0.046, 8), (0.021, 3), (0.015, 4), (0.002, 5)] +""", + ) + + def test_mixedmm_h100(self) -> None: + run_bash("get_mixedmm_dataset.sh") + run_bash("gen_mixedmm_heuristic_h100.sh") + file_path = "../../../torch/_inductor/autoheuristic/artifacts/_MixedMMH100.py" + h100_heuristic_generated_code = read_file_to_string(file_path) + + self.assertExpectedInline( + h100_heuristic_generated_code, + """\ +# flake8: noqa: B950 +# fmt: off +# This file was generated by AutoHeuristic. Do not modify it manually! 
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/ +from typing import List, Optional, Tuple + +from torch._inductor.autoheuristic.autoheuristic_utils import ( + AHContext, + AHMetadata, + Choice, +) +from torch._inductor.autoheuristic.learnedheuristic_interface import ( + LearnedHeuristicDecision, +) + + +class MixedMMH100(LearnedHeuristicDecision): + + def __init__(self) -> None: + self.choices: List[Choice] = [] + self.fill_choices() + + def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool: + return ( + metadata.name == self.get_name() + and metadata.shared_memory == 232448 + and str(metadata.device_capa) == "(9, 0)" + ) + + def get_confidence_threshold(self) -> float: + return 0.0 + + def get_choice(self, idx: int) -> Optional[str]: + if idx < len(self.choices): + return self.choices[idx] + return None + + def fill_choices(self) -> None: + self.choices.append('extern_fallback_mixed_mm') + self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4') + self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8') + + def get_name(self) -> str: + return 'mixed_mm' + + def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]: + if context.get_value('arith_intensity') <= 15.988086223602295: + if context.get_value('n') <= 25280.0: + if context.get_value('n') <= 1344.0: + if context.get_value('mat1_stride_0') <= 7808.0: + return [(0.581, 7), (0.419, 6)] + else: + if context.get_value('m*n') <= 7680.0: + return [(0.875, 0), (0.125, 6)] + else: + return [(0.833, 0), (0.167, 7)] + else: + if context.get_value('n') <= 8512.0: + if str(context.get_value('mat2_dtype')) != 'torch.int8': + return [(0.763, 6), (0.237, 7)] + else: + return [(0.725, 7), (0.275, 
6)] + else: + if str(context.get_value('mat1_dtype')) != 'torch.bfloat16': + return [(0.736, 7), (0.197, 9), (0.048, 6), (0.014, 8), (0.005, 10)] + else: + return [(0.473, 7), (0.398, 6), (0.097, 9), (0.032, 10)] + else: + if context.get_value('n') <= 42254.0: + if context.get_value('n') <= 33856.0: + if context.get_value('k*n') <= 68157440.0: + return [(0.370, 4), (0.370, 5), (0.074, 7), (0.074, 8), (0.074, 11), (0.037, 6)] + else: + return [(0.916, 8), (0.036, 7), (0.036, 9), (0.012, 4)] + else: + return [(0.659, 5), (0.341, 6)] + else: + if context.get_value('k*n') <= 326052992.0: + if context.get_value('n') <= 55232.0: + return [(0.571, 6), (0.321, 7), (0.036, 4), (0.036, 8), (0.036, 9)] + else: + return [(0.506, 6), (0.325, 8), (0.104, 7), (0.039, 5), (0.026, 9)] + else: + if context.get_value('n') <= 57024.0: + return [(0.462, 9), (0.385, 7), (0.115, 6), (0.038, 8)] + else: + return [(0.598, 8), (0.223, 9), (0.107, 6), (0.071, 7)] + else: + if context.get_value('m*n') <= 543936.0: + if str(context.get_value('17LEQmLEQ32')) != 'True': + if context.get_value('m*n') <= 262272.0: + if context.get_value('n') <= 1592.5: + return [(0.860, 0), (0.140, 9)] + else: + return None + else: + if context.get_value('m*k') <= 1294336.0: + return [(0.833, 17), (0.150, 18), (0.017, 15)] + else: + return [(0.917, 17), (0.083, 8)] + else: + if context.get_value('n') <= 12416.0: + if context.get_value('m*n') <= 43008.0: + return None + else: + return [(0.853, 14), (0.147, 9)] + else: + return [(0.625, 12), (0.375, 14)] + else: + if context.get_value('m') <= 32.5: + if context.get_value('mat2_stride_1') <= 6656.0: + if context.get_value('n') <= 69184.0: + return [(0.611, 12), (0.361, 14), (0.028, 13)] + else: + return [(1.000, 12)] + else: + if context.get_value('mat2_stride_1') <= 20864.0: + return [(1.000, 12)] + else: + return [(0.958, 12), (0.042, 9)] + else: + if context.get_value('m*n') <= 1085440.0: + if context.get_value('n') <= 9152.0: + return [(1.000, 18)] + else: + return [(0.780, 18), (0.160, 16), (0.060, 20)] + else: + if context.get_value('m') <= 67.0: + return [(0.650, 16), (0.203, 19), (0.122, 18), (0.016, 20), (0.008, 1)] + else: + return [(0.561, 3), (0.185, 16), (0.096, 20), (0.083, 19), (0.076, 2)] +""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py b/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py new file mode 100644 index 00000000000..df96f020dc6 --- /dev/null +++ b/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py @@ -0,0 +1,56 @@ +# mypy: ignore-errors +import os +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from train_decision import AHTrainDecisionTree + +from torch._inductor.autoheuristic.autoheuristic_utils import mixed_mm_operations + + +class AHTrainDecisionTreeMixedMM(AHTrainDecisionTree): + def __init__(self): + super().__init__() + + def add_new_features(self, results): + ops = mixed_mm_operations() + added_categorical_features = [] + for op in ops: + results[op.name] = results.apply(op.func, axis=1) + if op.is_categorical: + added_categorical_features.append(op.name) + return (results, added_categorical_features) + + def get_default_config(self, row): + return "extern_fallback_mixed_mm" + + def get_allowed_wrong_prediction_pct(self): + # it is okay to have wrong predictions + # we introduce uncertainty by marking leaves as unsafe instead + return 1.0 + + def get_test_and_val_size(self): + return (0.01, 0.19) + + def 
is_unsafe_leaf(self, row, predicted_config, choice2time): + if predicted_config not in choice2time: + # heuristic always returns "unsure" in such a case + return False + predicted_time = choice2time[predicted_config] + fallback_time = choice2time[self.get_default_config(row)] + # we mark a leaf as unsafe if there is a chance that our choice is more than 5% slower than the fallback + # we are okay with making the wrong choice, as long as our choice is better than fallback because + # fallback is the default when max_autotune is false + return 1.05 * fallback_time < predicted_time + + def get_grid_search_values(self): + # A lot of different hyperparameters perform very similarly on mixed_mm + # it is kind of hard to automatically pick one so I just manually picked one with a small max_depth + return {"max_depth": [5], "min_samples_leaf": [0.01], "criterion": ["entropy"]} + + +if __name__ == "__main__": + train = AHTrainDecisionTreeMixedMM() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/mm/README.md b/torchgen/_autoheuristic/mm/README.md new file mode 100644 index 00000000000..3d1a23e656f --- /dev/null +++ b/torchgen/_autoheuristic/mm/README.md @@ -0,0 +1,32 @@ +If you just want to re-generate existing heuristics with already collected data for mm for A100/H100, run the following scripts: + +`bash get_mm_dataset.sh # Downloads A100 and H100 datasets` +`bash gen_heuristic_a100.sh # Generates A100 heuristic` +`bash gen_heuristic_h100.sh # Generates H100 heuristic` + +If you want to collect new data, or generate a heuristic for another GPU, use the `generate_heuristic_mm.sh` script: +First, go into generate_heuristic_mm.sh and modify the variables according to the comments. Then, run the script to perform benchmarks and collect training data: + +`bash generate_heuristic_mm.sh collect` + +This will collect training data on random inputs. Depending on how many GPUs you are using, this might take a day. +If you use multiple GPUs, you will have one file per GPU, e.g. "data_6.txt" and "data_7.txt" if you used the GPUs with ids 6 and 7. +To merge these into a single file, run: +`python torchgen/_autoheuristic/merge_data.py mm_train.txt data_6.txt data_7.txt` + +For mm, we also want to incorporate data from huggingface and TIMM models into the training data.
+ +To collect data for huggingface, run the following command: + +``` +TORCHINDUCTOR_AUTOHEURISTIC_USE="" TORCHINDUCTOR_AUTOHEURISTIC_COLLECT="mm" TORCHINDUCTOR_AUTOHEURISTIC_LOG_PATH="hf_train_mm.txt" TORCHINDUCTOR_MAX_AUTOTUNE=1 time python ../../../benchmarks/dynamo/huggingface.py --ci --performance --timing --explain --inductor --device cuda --train --amp +``` + +To collect data for TIMM models, run the following command +``` +TORCHINDUCTOR_AUTOHEURISTIC_USE="" TORCHINDUCTOR_AUTOHEURISTIC_COLLECT="mm" TORCHINDUCTOR_AUTOHEURISTIC_LOG_PATH="timm_train_mm.txt" TORCHINDUCTOR_MAX_AUTOTUNE=1 time python ../../../benchmarks/dynamo/timm_models.py --ci --performance --timing --explain --inductor --device cuda --train --amp +``` + +Afterwards, run the script in order to learn the heuristic: + +`bash generate_heuristic_mm.sh generate` diff --git a/torchgen/_autoheuristic/mm/gen_data_mm.py b/torchgen/_autoheuristic/mm/gen_data_mm.py new file mode 100644 index 00000000000..29911b6e378 --- /dev/null +++ b/torchgen/_autoheuristic/mm/gen_data_mm.py @@ -0,0 +1,122 @@ +import itertools +import os +import random +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from typing import Any, Tuple + +from benchmark_runner import BenchmarkRunner # type: ignore[import-not-found] +from benchmark_utils import ( # type: ignore[import-not-found] + fits_in_memory, + get_mm_tensors, + get_random_between_pow2, + set_precision, +) + +import torch +from torch._inductor.utils import fresh_inductor_cache + + +class BenchmarkRunnerMM(BenchmarkRunner): # type: ignore[misc, no-any-unimported] + """ + BenchmarkRunner for mm. + """ + + def __init__(self) -> None: + super().__init__("mm") + + def create_input(self) -> Tuple[Any, ...]: + dtype = random.choices([torch.float32, torch.float16, torch.bfloat16])[0] + set_precision(dtype) + m, k, n = self.get_m_k_n(dtype) + return (m, k, n, dtype) + + def run_benchmark( + self, + m: int, + k: int, + n: int, + dtype: Any, + ) -> Any: + # for a given shape, test all possible combinations of transpose_left and transpose_right + for transpose_left, transpose_right in itertools.product( + [False, True], repeat=2 + ): + print( + f"m: {m}, k: {k}, n: {n}, transpose_left: {transpose_left}, transpose_right: {transpose_right}, dtype: {dtype}" + ) + a, b = get_mm_tensors( + m, + k, + n, + transpose_left, + transpose_right, + dtype_left=dtype, + dtype_right=dtype, + ) + + with fresh_inductor_cache(): + + def mixed_mm(A: Any, B: Any) -> Any: + return torch.mm(A, B) + + cf = torch.compile(mixed_mm, mode="max-autotune-no-cudagraphs") + cf(a, b) + torch.compiler.reset() + + def random_multiple_of_128(self, min_num: int = 7, max_num: int = 17) -> int: + # generates a random number ran_pow2 between min_num and max_num -1 + # and returns a random multiple of 128 between 2^ran_pow2 and 2^(ran_pow2+1) + ran_pow2 = random.randint(min_num, max_num - 1) + start = (2**ran_pow2) // 128 + end = (2 ** (ran_pow2 + 1)) // 128 + random_multiple = random.randint(start, end) + return random_multiple * 128 + + def get_distr_type(self) -> str: + # 85%: choose a random multiple of 128 between 2^10 and 2^17 + # 10%: choose a random power of 2 between 2^0 and 2^17 + # 4%: choose a random number between 1 and 131072 + # 1%: choose a random number between 2^i and 2^(i+1) with i in [1, 16] + return random.choices( + ["mult_128", "pow2", "uniform", "uniform-between-pow2"], + [0.85, 0.1, 0.04, 0.01], + )[0] + + def get_random_dim(self) -> int: + distr_type = self.get_distr_type() + 
if distr_type == "mult_128": + return self.random_multiple_of_128(min_num=10, max_num=17) + if distr_type == "pow2": + return int(2 ** random.randint(0, 17)) + elif distr_type == "uniform-between-pow2": + # TODO(AlnisM): make mypy work for torchgen/_autoheuristic/ + return int(get_random_between_pow2(min_power2=1, max_power2=17)) + elif distr_type == "uniform": + return random.randint(1, 131072) + print(f"random_type {distr_type} not supported") + sys.exit(1) + + def get_m_k_n(self, dtype: Any) -> Tuple[int, int, int]: + numel_max = 2**31 + + # repeat until tensors fit in memory + while True: + m = self.get_random_dim() + k = self.get_random_dim() + n = self.get_random_dim() + + if m * k >= numel_max or m * n >= numel_max or k * n >= numel_max: + # autotuning will not happen for tensors that are this large + continue + + if fits_in_memory(dtype, m, k, n): + return (m, k, n) + + +if __name__ == "__main__": + runner = BenchmarkRunnerMM() + runner.run() diff --git a/torchgen/_autoheuristic/mm/gen_heuristic_a100.sh b/torchgen/_autoheuristic/mm/gen_heuristic_a100.sh new file mode 100644 index 00000000000..595bbcb068c --- /dev/null +++ b/torchgen/_autoheuristic/mm/gen_heuristic_a100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +dir="a100/" +data="a100_mm.txt" +python train_decision_mm.py ${dir}a100_mm.txt --heuristic-name MMRankingA100 --ranking 10 --save-dot --data train_timm ${dir}a100_timm_train_mm.txt --data train_hf ${dir}a100_hf_train_mm.txt diff --git a/torchgen/_autoheuristic/mm/gen_heuristic_h100.sh b/torchgen/_autoheuristic/mm/gen_heuristic_h100.sh new file mode 100644 index 00000000000..14eaccccb62 --- /dev/null +++ b/torchgen/_autoheuristic/mm/gen_heuristic_h100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +dir="h100/" +data="h100_mm.txt" +python train_decision_mm.py ${dir}h100_mm.txt --heuristic-name MMRankingH100 --ranking 10 --save-dot --data train_timm ${dir}h100_timm_train_mm.txt --data train_hf ${dir}h100_hf_train_mm.txt diff --git a/torchgen/_autoheuristic/mm/get_mm_dataset.sh b/torchgen/_autoheuristic/mm/get_mm_dataset.sh new file mode 100644 index 00000000000..7461dec41dd --- /dev/null +++ b/torchgen/_autoheuristic/mm/get_mm_dataset.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' +a100_data='a100_mm.zip' +h100_data='h100_mm.zip' +datasets=("${a100_data}" "${h100_data}") +for dataset in "${datasets[@]}"; do + url="${base_url}${dataset}" + wget ${url} + unzip ${dataset} + rm ${dataset} +done diff --git a/torchgen/_autoheuristic/mm/train_decision_mm.py b/torchgen/_autoheuristic/mm/train_decision_mm.py new file mode 100644 index 00000000000..945dcc98561 --- /dev/null +++ b/torchgen/_autoheuristic/mm/train_decision_mm.py @@ -0,0 +1,64 @@ +# mypy: ignore-errors +import os +import sys + +import pandas as pd # type: ignore[import-untyped] + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from train_decision import AHTrainDecisionTree + +from torch._inductor.autoheuristic.autoheuristic_utils import mm_operations + + +class AHTrainDecisionTreeMM(AHTrainDecisionTree): + def __init__(self): + super().__init__() + + def add_new_features(self, results): + ops = mm_operations() + added_categorical_features = [] + for op in ops: + results[op.name] = results.apply(op.func, axis=1) + if op.is_categorical: + added_categorical_features.append(op.name) + return (results, added_categorical_features) + + def get_default_config(self, row): + return "extern_mm" + + def get_allowed_wrong_prediction_pct(self): + return 1.0 + + def 
get_test_and_val_size(self): + return (0.01, 0.19) + + def get_grid_search_values(self): + return {"max_depth": [5], "min_samples_leaf": [0.01], "criterion": ["entropy"]} + + def add_training_data(self, df_train, datasets): + # add each dataset to the training data 3 times + # we really want to make sure that the heuristic performs well on these datasets + df_timm_train = datasets["train_timm"] + df_timm_train = df_timm_train.loc[df_timm_train.index.repeat(3)].reset_index( + drop=True + ) + df_hf_train = datasets["train_hf"] + df_hf_train = df_hf_train.loc[df_hf_train.index.repeat(3)].reset_index( + drop=True + ) + df_train = datasets["train"] + df_train = pd.concat( + [df_train, df_timm_train, df_hf_train], + ignore_index=True, + ) + return df_train + + def ranking_always_included_choices(self): + return ["extern_mm"] + + +if __name__ == "__main__": + train = AHTrainDecisionTreeMM() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/pad_mm/README.md b/torchgen/_autoheuristic/pad_mm/README.md new file mode 100644 index 00000000000..0383d9b00b6 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/README.md @@ -0,0 +1,14 @@ +If you just want to re-generate existing heuristics with already collected data for pad_mm for A100, run the following scripts: + +`bash get_padmm_dataset.sh # Downloads A100` +`bash gen_pad_mm_a100.sh # Generates A100 heuristic` + +If you want to collect new data, or generate a heuristic for another GPU, use the `generate_heuristic_pad_mm.sh` script: +First, go into the generate_heuristic_mm.sh and modify the variables according to the comments. Then, run the script to perform benchmarks and collect training data: + +`bash generate_heuristic_pad_mm.sh collect` + +This will collect training data on random inputs. Depending on how many GPUs you are using, this might take a day. +Afterwards, run the script in order to learn the heuristic: + +`bash generate_heuristic_pad_mm.sh generate` diff --git a/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py b/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py new file mode 100644 index 00000000000..c3ca2da7166 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py @@ -0,0 +1,149 @@ +import os +import random +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from typing import Any, Tuple + +from benchmark_runner import BenchmarkRunner # type: ignore[import-not-found] +from benchmark_utils import ( # type: ignore[import-not-found] + fits_in_memory, + get_mm_tensors, + set_precision, + transpose_tensors, +) + +import torch +from torch._inductor.fx_passes.pad_mm import ( # type: ignore[import-not-found] + get_alignment_size_dtype, +) +from torch._inductor.utils import fresh_inductor_cache + + +class BenchmarkRunnerPadMM(BenchmarkRunner): # type: ignore[misc, no-any-unimported] + """ + BenchmarkRunner for pad_mm. Used to generate collect training data with AutoHeuristic to learn a heuristic. 
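+
+    Running this benchmark samples random (m, k, n), dtype, transpose and prepadded configurations,
+    compiles torch.mm with Inductor for each one, and lets AutoHeuristic log which padding decision
+    performs best. It is normally driven by generate_heuristic_pad_mm.sh rather than run by hand.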
+ """ + + def __init__(self) -> None: + super().__init__("pad_mm") + + def create_input(self) -> Tuple[Any, ...]: + dtype = self.get_dtype() + set_precision(dtype) + m, k, n = self.get_m_k_n(dtype) + + (transpose_left, transpose_right) = transpose_tensors() + prepadded_left = self.prepadded() + prepadded_right = self.prepadded() + return ( + m, + k, + n, + transpose_left, + transpose_right, + dtype, + prepadded_left, + prepadded_right, + ) + + def run_benchmark( + self, + m: int, + k: int, + n: int, + transpose_left: bool, + transpose_right: bool, + dtype: Any, + prepadded_left: bool, + prepadded_right: bool, + ) -> None: + a, b = get_mm_tensors( + m, + k, + n, + transpose_left, + transpose_right, + dtype_left=dtype, + dtype_right=dtype, + ) + + print("Benchmarking the following input:") + print(f"m={m} k={k} n={n} dtype={dtype}") + print(f"transpose_left={transpose_left} transpose_right={transpose_right}") + print(f"prepadded_left={prepadded_left} prepadded_right={prepadded_right}") + + with fresh_inductor_cache(): + + def mm(a: Any, b: Any) -> Any: + return torch.mm(a, b) + + def mm_mat1_prepadded(a: Any, b: Any) -> Any: + return torch.mm(a + 1, b) + + def mm_mat2_prepadded(a: Any, b: Any) -> Any: + return torch.mm(a, b + 1) + + def mm_mat1_mat2_prepadded(a: Any, b: Any) -> Any: + return torch.mm(a + 1, b + 1) + + if prepadded_left and prepadded_right: + cf = torch.compile(mm_mat1_mat2_prepadded) + elif prepadded_left: + cf = torch.compile(mm_mat1_prepadded) + elif prepadded_right: + cf = torch.compile(mm_mat2_prepadded) + else: + cf = torch.compile(mm) + cf(a, b) + torch.compiler.reset() + + def get_random_dim( + self, min_power2: int = 1, max_power2: int = 16, p_unaligned: float = 0.25 + ) -> int: + aligned = random.choices([True, False], [1 - p_unaligned, p_unaligned])[0] + if aligned: + return 2 ** random.randint(min_power2, max_power2) # type: ignore[no-any-return] + else: + # choose a random number between 2^i and 2^(i+1) + return self.get_random_between_pow2(min_power2, max_power2) # type: ignore[no-any-return] + + def is_aligned(self, dim: int, align_size: int) -> bool: + return dim % align_size == 0 + + def get_m_k_n(self, dtype: Any) -> Tuple[int, int, int]: + uniform = random.choices([True, False])[0] + align_size = get_alignment_size_dtype(dtype) + + # repeat until tensors fit in memory + while True: + if uniform: + m = random.randint(1, 65536) + k = random.randint(1, 65536) + n = random.randint(1, 65536) + else: + m = self.get_random_dim() + k = self.get_random_dim() + n = self.get_random_dim() + + if all(self.is_aligned(dim, align_size) for dim in [m, k, n]): + # skip if already aligned + continue + + if fits_in_memory(dtype, m, k, n): + return (m, k, n) + + def prepadded(self, p_prepadded: float = 0.2) -> bool: + # p_prepadded: probability that a tensor is "prepadded", i.e. 
pad_mm excludes time it takes to pad from benchmarking + return random.choices([True, False], [p_prepadded, 1 - p_prepadded])[0] + + def get_dtype(self) -> Any: + dtype_choices = [torch.float16, torch.bfloat16, torch.float32] + return random.choices(dtype_choices)[0] + + +if __name__ == "__main__": + runner = BenchmarkRunnerPadMM() + runner.run() diff --git a/torchgen/_autoheuristic/pad_mm/gen_pad_mm_a100.sh b/torchgen/_autoheuristic/pad_mm/gen_pad_mm_a100.sh new file mode 100644 index 00000000000..e1f9a60bae5 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/gen_pad_mm_a100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +data="pad_mm_a100_data.txt" + +python train_regression_pad_mm.py ${data} --heuristic-name PadMMA100 diff --git a/torchgen/_autoheuristic/pad_mm/gen_pad_mm_h100.sh b/torchgen/_autoheuristic/pad_mm/gen_pad_mm_h100.sh new file mode 100644 index 00000000000..eff8cb98204 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/gen_pad_mm_h100.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +data="pad_mm_h100_data.txt" + +python train_regression_pad_mm.py ${data} --heuristic-name PadMMH100 diff --git a/torchgen/_autoheuristic/pad_mm/generate_heuristic_pad_mm.sh b/torchgen/_autoheuristic/pad_mm/generate_heuristic_pad_mm.sh new file mode 100644 index 00000000000..d7cb6b99164 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/generate_heuristic_pad_mm.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Error: This script requires exactly one argument." + echo "`bash generate_heuristic_pad_mm.sh collect` to run benchmark and collect training data." + echo "`bash generate_heuristic_pad_mm.sh generate` to use the collected data to learn a heuristic." + exit 1 +fi + +MODE=$1 + +# !!! SPECIFY THE GPUs THAT YOU WANT TO USE HERE !!! +GPU_DEVICE_IDS="4,5" + +# !!! SPECIFY THE CONDA ENVIRONEMNT THAT YOU WANT TO BE ACTIVATED HERE !!! +CONDA_ENV=heuristic-pr + +NUM_SAMPLES=2000 + +# This is where AutoHeuristic will store autotuning results +OUTPUT_DIR="a100" + +# !!! CHANGE THE NAME OF THE HEURISTIC IF YOU WANT TO LEARN A HEURISTIC FOR A GPU THAT IS NOT A100 !!! 
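+# For example, use HEURISTIC_NAME="PadMMH100" (and, say, OUTPUT_DIR="h100") when targeting an H100; see gen_pad_mm_h100.sh.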
+HEURISTIC_NAME="PadMMA100" + +BENCHMARK_SCRIPT="gen_data_pad_mm.py" + +TRAIN_SCRIPT="train_regression_pad_mm.py" + +bash ../generate_heuristic.sh ${MODE} ${GPU_DEVICE_IDS} ${CONDA_ENV} ${NUM_SAMPLES} ${OUTPUT_DIR} ${HEURISTIC_NAME} ${BENCHMARK_SCRIPT} ${TRAIN_SCRIPT} diff --git a/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh b/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh new file mode 100644 index 00000000000..b8ab60d943e --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +a100_zip="pad_mm_a100_data.zip" +a100_data="https://github.com/AlnisM/autoheuristic-datasets/raw/main/${a100_zip}" +rm -f ${a100_zip} +wget ${a100_data} +unzip -o ${a100_zip} +rm ${a100_zip} diff --git a/torchgen/_autoheuristic/pad_mm/test_pad_mm.py b/torchgen/_autoheuristic/pad_mm/test_pad_mm.py new file mode 100644 index 00000000000..6469a6cd37d --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/test_pad_mm.py @@ -0,0 +1,137 @@ +import os +import sys +import unittest + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from expecttest import TestCase + +from test_utils import read_file_to_string, run_bash # type: ignore[import-not-found] + + +class TestPadMM(TestCase): + def test_padmm_a100(self) -> None: + run_bash("get_padmm_dataset.sh") + run_bash("gen_pad_mm_a100.sh") + file_path = "../../../torch/_inductor/autoheuristic/artifacts/_PadMMA100.py" + a100_heuristic_generated_code = read_file_to_string(file_path) + + self.assertExpectedInline( + a100_heuristic_generated_code, + """\ +# flake8: noqa: B950 +# fmt: off +# This file was generated by AutoHeuristic. Do not modify it manually! +# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/pad_mm/ +from torch._inductor.autoheuristic.autoheuristic_utils import AHContext, AHMetadata, Choice, CHOICE_COL +from torch._inductor.autoheuristic.learnedheuristic_interface import ( + LearnedHeuristicRegression, +) + + +class PadMMA100(LearnedHeuristicRegression): + + def __init__(self) -> None: + pass + + def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool: + return ( + metadata.name == self.get_name() + and metadata.shared_memory == 166912 + and str(metadata.device_capa) == "(8, 0)" + ) + + def get_feedback(self, context: AHContext, choice: Choice) -> float: + context.context_dict[CHOICE_COL] = choice + return self.predict(context) + + def get_confidence_threshold(self) -> float: + return 1.7025303314066 + + def get_name(self) -> str: + return 'pad_mm' + + def predict(self, context: AHContext) -> float: + if str(context.get_value('choice')) != 'pad': + if str(context.get_value('using_tf32')) != 'False': + if context.get_value('m*n') <= 4171264.0: + if context.get_value('m*k') <= 3999308.0: + return 1.8751469764071178 + else: + if str(context.get_value('n_multiple_32')) != 'True': + return 0.9117231355626345 + else: + return 1.1607689608873861 + else: + if str(context.get_value('n_multiple_2')) != 'True': + if str(context.get_value('using_tf32')) != 'True': + return 0.7430382200435992 + else: + return 0.8531269794448678 + else: + if str(context.get_value('k_multiple_2')) != 'True': + return 0.7577181972719917 + else: + return 0.8977349440424219 + else: + if context.get_value('m*n') <= 1299712.0: + return 1.1669723418995592 + else: + if context.get_value('mat2_stride_1') <= 45217.5: + if context.get_value('m*n') <= 55884158.0: + return 1.0262769936909601 + else: + return 1.0022677428470845 + else: + if 
context.get_value('m') <= 18478.0: + return 1.1127066261894312 + else: + return 1.0337740659894263 + else: + if str(context.get_value('mat1_dtype')) != 'torch.float32': + if str(context.get_value('n_multiple_2')) != 'False': + if str(context.get_value('k_multiple_2')) != 'True': + if context.get_value('mat1_stride_0') <= 561.0: + return 1.2900382135142956 + else: + return 1.5761737616057887 + else: + if context.get_value('num_dims_needs_padding') <= 1.5: + return 1.0472263310239422 + else: + return 1.1727673465762514 + else: + if context.get_value('k') <= 28238.5: + if context.get_value('k/(m*n)') <= 0.00026227018679492176: + return 1.6770542505397175 + else: + return 1.3974785435105923 + else: + if str(context.get_value('mat1_dtype')) != 'torch.bfloat16': + return 1.3952699800111992 + else: + return 1.5759286511628336 + else: + if str(context.get_value('using_tf32')) != 'False': + if context.get_value('m*n') <= 14119424.0: + return 0.8875772670422478 + else: + if str(context.get_value('mat2_innermost_needs_padding')) != 'True': + return 1.1467728924377265 + else: + return 1.215842963532998 + else: + if context.get_value('arith_intensity') <= 396.8774871826172: + return 0.89940161869551 + else: + if context.get_value('mat2_stride_1') <= 45217.5: + return 0.9964328169353532 + else: + return 0.9493479238294826 +""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py new file mode 100644 index 00000000000..9ed37b7a00d --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py @@ -0,0 +1,27 @@ +# mypy: ignore-errors +import os +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from train_decision import AHTrainDecisionTree + +from torch._inductor.autoheuristic.autoheuristic_utils import pad_mm_operations + + +class AHTrainDecisionTreePadMM(AHTrainDecisionTree): + def __init__(self): + super().__init__() + + def add_new_features(self, results): + ops = pad_mm_operations() + for op in ops: + results[op.name] = results.apply(op.func, axis=1) + added_categorical_features = [op.name for op in ops if op.is_categorical] + return (results, added_categorical_features) + + +if __name__ == "__main__": + train = AHTrainDecisionTreePadMM() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/pad_mm/train_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_pad_mm.py new file mode 100644 index 00000000000..ab60c44dac0 --- /dev/null +++ b/torchgen/_autoheuristic/pad_mm/train_pad_mm.py @@ -0,0 +1,27 @@ +# mypy: ignore-errors +import os +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from train_regression import AHTrainRegressionTree + +from torch._inductor.fx_passes.pad_mm import pad_mm_operations + + +class AHTrainPadMM(AHTrainRegressionTree): + def __init__(self) -> None: + super().__init__() + + def add_new_features(self, results): + ops = pad_mm_operations() + for op in ops: + results[op.name] = results.apply(op.func, axis=1) + added_categorical_features = [op.name for op in ops if op.is_categorical] + return (results, added_categorical_features) + + +if __name__ == "__main__": + train = AHTrainPadMM() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py new file mode 100644 index 00000000000..e9cdbf517e0 --- /dev/null +++ 
b/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py @@ -0,0 +1,27 @@ +# mypy: ignore-errors +import os +import sys + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from train_regression import AHTrainRegressionTree + +from torch._inductor.fx_passes.pad_mm import pad_mm_operations + + +class AHTrainPadMM(AHTrainRegressionTree): + def __init__(self): + super().__init__() + + def add_new_features(self, results): + ops = pad_mm_operations() + for op in ops: + results[op.name] = results.apply(op.func, axis=1) + added_categorical_features = [op.name for op in ops if op.is_categorical] + return (results, added_categorical_features) + + +if __name__ == "__main__": + train = AHTrainPadMM() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/requirements.txt b/torchgen/_autoheuristic/requirements.txt new file mode 100644 index 00000000000..fda4dd66e50 --- /dev/null +++ b/torchgen/_autoheuristic/requirements.txt @@ -0,0 +1,2 @@ +pandas +scikit-learn diff --git a/torchgen/_autoheuristic/test.sh b/torchgen/_autoheuristic/test.sh new file mode 100644 index 00000000000..ed5e1a3b12a --- /dev/null +++ b/torchgen/_autoheuristic/test.sh @@ -0,0 +1,7 @@ +# you should run these tests whenever you make changes to any of the train*.py files within this directory +# running these tests takes around 10 minutes on my machine +cd mixed_mm +python test_mixed_mm.py +cd ../pad_mm +python test_pad_mm.py +cd ../ diff --git a/torchgen/_autoheuristic/test_utils.py b/torchgen/_autoheuristic/test_utils.py new file mode 100644 index 00000000000..0efbc458c26 --- /dev/null +++ b/torchgen/_autoheuristic/test_utils.py @@ -0,0 +1,19 @@ +import subprocess + + +def read_file_to_string(file_path: str) -> str: + with open(file_path) as file: + return file.read() + + +def run_bash(bash_script_path: str) -> None: + try: + print("Executing: ", bash_script_path) + result = subprocess.run( + ["bash", bash_script_path], capture_output=True, text=True, check=True + ) + # Print the output + print(f"Output of {bash_script_path}: {result.stdout}") + except subprocess.CalledProcessError as e: + print(f"An error occurred executing {bash_script_path}: {e}") + print("Error output:", e.stderr) diff --git a/torchgen/_autoheuristic/train.py b/torchgen/_autoheuristic/train.py new file mode 100644 index 00000000000..4e8dd330a13 --- /dev/null +++ b/torchgen/_autoheuristic/train.py @@ -0,0 +1,179 @@ +# mypy: ignore-errors + +import argparse +import json +import warnings + +import pandas as pd # type: ignore[import-untyped] + +from torch._inductor.autoheuristic.autoheuristic_utils import ( + CHOICE_COL, + get_metadata_str_from_log, +) + + +# TODO (AlnisM): Fix these warnings +warnings.filterwarnings( + "ignore", + message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", +) +warnings.filterwarnings( + "ignore", + message="DataFrameGroupBy.apply operated on the grouping columns.", +) + + +class AHTrain: + """ + Base class for AutoHeuristic training. 
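+
+    Subclasses such as AHTrainDecisionTree and AHTrainRegressionTree implement main() and override
+    hooks like add_new_features(); they are invoked from the command line, for example (abridged
+    from mm/gen_heuristic_a100.sh):
+
+        python train_decision_mm.py a100/a100_mm.txt --heuristic-name MMRankingA100 --ranking 10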
+ """ + + def __init__(self) -> None: + self.parser = argparse.ArgumentParser() + self.add_base_arguments() + self.args = None + + def add_base_arguments(self): + self.parser.add_argument( + "dataset", + type=str, + help="Path to text file containing data collected with AutoHeuristic.", + ) + self.parser.add_argument( + "--nrows", + type=int, + default=None, + help="Only read first n rows of the dataset.", + ) + self.parser.add_argument( + "--heuristic-name", + type=str, + default="learned_heuristic", + help="Name of the heuristic to be generated.", + ) + self.parser.add_argument( + "--data", + nargs=2, + action="append", + metavar=("TYPE", "PATH"), + help="Specify name of datasets and file paths to be evaluated.", + ) + self.parser.add_argument( + "--save-dot", + action="store_true", + help="Export heuristic to graphviz dot.", + ) + self.parser.add_argument( + "--ranking", + type=int, + default=None, + help=""" + Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice. + The argument is the number of choices the heuristic will provide. + """, + ) + + def parse_args(self): + return self.parser.parse_args() + + def parse_log(self, log_path, nrows=None): + (df, metadata) = self.deserialize_data(log_path) + numerical_features = metadata["numerical_features"] + categorical_features = metadata["categorical_features"] + choices = df[CHOICE_COL].unique().tolist() + features = numerical_features + categorical_features + if nrows is not None: + df = df.head(nrows) + df = self.filter_df(df) + return (df, metadata, features, categorical_features, choices) + + def generate_heuristic(self): + self.args = self.parse_args() + self.main( + self.args.dataset, + self.args.data, + self.args.nrows, + self.args.heuristic_name, + self.args.save_dot, + self.args.ranking is not None, + ) + + def filter_df(self, df): + return df + + def add_new_features(self, results): + return (results, []) + + def add_real_datasets(self, datasets, other_datasets, cat_feature2cats): + if other_datasets: + for name, path in other_datasets: + (df_other, choices, _, _, _) = self.get_df( + path, cat_feature2cats=cat_feature2cats, apply_filters=False + ) + datasets[name] = df_other + + def handle_categorical_features( + self, cat_feature2cats, categorical_features, results + ): + # Doing this here because if we create another df for testing purposes + # and that other df does not contain all categories for a categorical feature, + # pd.dummies will not create columns for the missing categories + if not cat_feature2cats: + cat_feature2cats = {} + for cat_feature in categorical_features: + if cat_feature in cat_feature2cats: + categories = cat_feature2cats[cat_feature] + else: + categories = results[cat_feature].unique() + cat_feature2cats[cat_feature] = categories + results[cat_feature] = pd.Categorical( + results[cat_feature], categories=categories + ) + + dummy_col_2_col_val = {} + for col in categorical_features: + unique_vals = results[col].unique() + for val in unique_vals: + dummy_col_2_col_val[f"{col}_{val}"] = (col, val) + # one-hot encode categorical features + results = pd.get_dummies(results, columns=categorical_features) + return (results, cat_feature2cats, dummy_col_2_col_val) + + def gen_precondition(self, opt_name, shared_memory, device_capa): + return f""" def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool: + return ( + metadata.name == self.get_name() + and metadata.shared_memory == {shared_memory} + and str(metadata.device_capa) == "{device_capa}" + 
)""" + + def codegen_boilerplate( + self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt + ): + pass + + def gen_predict_fn_def(self): + pass + + def write_heuristic_to_file(self, lines, heuristic_name): + output_file = ( + f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py" + ) + path = f"{output_file}" + with open(path, "w") as f: + f.write("\n".join(lines) + "\n") + + def deserialize_data(self, log_path): + json_string = get_metadata_str_from_log(log_path) + metadata = self.deserialize_metadata(json_string) + + df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip") + return (df, metadata) + + def deserialize_metadata(self, json_string): + return json.loads(json_string) + + +if __name__ == "__main__": + train = AHTrain() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/train_decision.py b/torchgen/_autoheuristic/train_decision.py new file mode 100644 index 00000000000..31cc7632fac --- /dev/null +++ b/torchgen/_autoheuristic/train_decision.py @@ -0,0 +1,936 @@ +# mypy: ignore-errors + +import itertools +import json +import logging +import math +import warnings + + +warnings.filterwarnings( + "ignore", + message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", +) + +from dataclasses import dataclass + +import numpy as np +import pandas as pd # type: ignore[import-untyped] +from ah_tree import DecisionTree +from scipy.stats import gmean +from sklearn.model_selection import train_test_split +from sklearn.tree import DecisionTreeClassifier +from train import AHTrain + + +log = logging.getLogger(__name__) +DEBUG = True +if DEBUG: + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" + ) + ch.setFormatter(formatter) + log.addHandler(ch) + + +class AHTrainDecisionTree(AHTrain): + def __init__(self): + super().__init__() + + def debug_time(self, row, top_k_choices): + choices_feedback = json.loads(row["choice2time"]) + timings = sorted(choices_feedback.items(), key=lambda x: x[1]) + for choice, time in timings: + result = f"{choice} {time}" + if choice in top_k_choices: + result += " TOPK" + print(result) + + def is_unsafe_leaf(self, row, predicted_config, choice2time): + """ + Can be overridden by subclasses to define their own logic for deciding when a leaf is unsafe. Returns a sample + that landed in the leaf, the choice predicted by the tree, and a dictionary that maps each choice to the + execution time. One can for example decide to mark a leaf as unsafe if the predicted choice is 2x slower + than the fastest choice. + If a leaf is unsafe, the learned heuristic will always return 'unsure' if an input lands in that leaf. + """ + + return False + + def get_unsafe_leaves(self, model, df, feature_columns): + """ + Given a trained decision tree, and a dataframe containing the training data, returns a list of unsafe leaves. 
+ """ + X = df[feature_columns] + y = df["winner"] + leaf_ids = model.apply(X) + unique_leaves = np.unique(leaf_ids) + + unsafe_leaves = [] + # Iterate over each leaf + for leaf in unique_leaves: + leaf_mask = leaf_ids == leaf + # Get samples that land in this leaf + leaf_X = X[leaf_mask] + + predicted_config = model.predict(leaf_X.iloc[[0]])[0] + + # For each sample, check if we should mark the leaf as unsafe + for idx, row in leaf_X.iterrows(): + choice2time = json.loads(df.loc[idx, "choice2time"]) + if self.is_unsafe_leaf(row, predicted_config, choice2time): + unsafe_leaves.append(leaf) + break + return unsafe_leaves + + def get_allowed_wrong_prediction_pct(self): + """ + This is used to determine a threshold for when a learned heuristic returns 'unsure'. + If this function returns 0.01, we will set the probability required for the decision tree to return a decision + such that at most 1% of the predictions will be wrong on the validation set. + """ + return 0.01 + + def get_grid_search_values(self): + """ + Standard values for grid search. Can be overriden. + """ + return { + "max_depth": [5, 6, 7], + "min_samples_leaf": [1, 5, 10, 0.01, 0.05, 0.02], + "criterion": ["gini", "entropy"], + } + + def predict(self, model, df, feature_columns): + """ + Returns the predictions, probabilities, and leaf ids for a given dataframe. + """ + predictions = model.predict(df[feature_columns]) + proba = model.predict_proba(df[feature_columns]) + leaf_ids = model.apply(df[feature_columns]) + return predictions, proba, leaf_ids + + def ranking_num_choices(self): + # if the heuristic is used for ranking, this function returns the number + # of choices that the heuristic will return + if self.args.ranking is None: + return 5 + return self.args.ranking + + def train_and_evaluate_models( + self, + datasets, + max_depths, + min_samples_leafs, + criterion_list, + feature_columns, + ranking=False, + ): + """ + Does a grid search over max_depths, min_samples_leafs, and criterion_list and returns the best model. 
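+        # "Best" here means: most correct predictions on the validation set among all models whose number of
+        # wrong predictions stays within get_allowed_wrong_prediction_pct() of the validation set size.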
+ """ + + results = [] + best_model = None + best_model_safe_proba = 0 + best_model_num_correct = 0 + best_model_num_wrong = 0 + best_model_unsafe_leaves = [] + columns = ["set", "crit", "max_depth", "min_samples_leaf"] + metrics_columns = [] + for max_depth, min_samples_leaf, criterion in itertools.product( + max_depths, min_samples_leafs, criterion_list + ): + print( + f"max_depth={max_depth} min_samples_leaf={min_samples_leaf} criterion={criterion}" + ) + model = DecisionTreeClassifier( + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + criterion=criterion, + random_state=42, + ) + df_train = datasets["train"] + df_val = datasets["val"] + if ranking: + model.fit( + df_train[feature_columns], + df_train["winner"], + sample_weight=df_train["relative_performance"], + ) + else: + model.fit(df_train[feature_columns], df_train["winner"]) + + model = DecisionTree(model, feature_columns) + + if ranking: + model.prune(df_train, "winner", k=self.ranking_num_choices()) + + unsafe_leaves = self.get_unsafe_leaves(model, df_train, feature_columns) + predictions, proba, leaf_ids = self.predict(model, df_val, feature_columns) + + wrong_pct = self.get_allowed_wrong_prediction_pct() + evaluator = DecisionEvaluator( + self, + model, + predictions, + df_val, + proba, + wrong_pct=wrong_pct, + unsafe_leaves=unsafe_leaves, + leaf_ids=leaf_ids, + k=self.ranking_num_choices(), + ranking=ranking, + ) + safe_proba = evaluator.get_safe_proba() + print(f"safe_proba={safe_proba}") + + def eval(name, df): + if ranking: + # when ranking is enabled, we duplicate each input for each choice that + # is almost as good as the best choice + # we do not want to evaluate the same input multiple times, so we remove duplicates here + df = df[df["winner"] == df["actual_winner"]] + predictions, proba, leaf_ids = self.predict(model, df, feature_columns) + evaluator = DecisionEvaluator( + self, + model, + predictions, + df, + proba, + wrong_pct=wrong_pct, + threshold=safe_proba, + unsafe_leaves=unsafe_leaves, + leaf_ids=leaf_ids, + k=self.ranking_num_choices(), + ranking=ranking, + ) + return evaluator.get_results() + + for dataset_name, dataset in datasets.items(): + eval_result: EvalResults = eval(dataset_name, dataset) + eval_result_metrics = eval_result.to_map() + if dataset_name == "val": + num_correct = eval_result.accuracy.num_correct + num_wrong = eval_result.accuracy.num_wrong + num_total = eval_result.accuracy.total + if num_wrong <= num_total * wrong_pct: + if num_correct > best_model_num_correct: + print( + f"new best model with {num_correct} correct and {num_wrong} wrong" + ) + best_model = model + best_model_num_correct = num_correct + best_model_num_wrong = num_wrong + best_model_safe_proba = safe_proba + best_model_unsafe_leaves = unsafe_leaves + + result = (dataset_name, criterion, max_depth, min_samples_leaf) + result += tuple(eval_result_metrics.values()) + results.append(result) + if len(metrics_columns) == 0: + metrics_columns = list(eval_result_metrics.keys()) + columns += metrics_columns + + return ( + pd.DataFrame(results, columns=columns), + best_model, + best_model_safe_proba, + best_model_unsafe_leaves, + ) + + def get_test_and_val_size(self): + """ + Returns the size of the test and validation sets. + """ + return (0.15, 0.15) + + def prepare_datasets(self, df, other_datasets, cat_feature2cats, ranking=False): + """ + Splits the dataframe into train, val, and test sets. + Also adds other datasets, specified by the user, to the train set. 
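+        The test and validation fractions come from get_test_and_val_size(); the user-supplied datasets are
+        the (name, path) pairs passed via --data and end up under their own keys in the returned dictionary.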
+ """ + test_size, val_size = self.get_test_and_val_size() + # Split into train+val and test + df_train_val, df_test = train_test_split( + df, test_size=test_size, random_state=42 + ) + + # Split train+val inputs into train and val + train_val_size = 1 - test_size + df_train, df_val = train_test_split( + df_train_val, test_size=val_size / train_val_size, random_state=42 + ) + datasets = {"train": df_train, "val": df_val, "test": df_test} + self.add_real_datasets(datasets, other_datasets, cat_feature2cats, ranking) + return datasets + + def export_to_dot(self, best_model, df, feature_columns): + """ + Export a learned decision tree to a dot file. + """ + dot_str = best_model.to_dot() + with open("best_model.dot", "w") as f: + f.write(dot_str) + + def get_feature_columns(self, df): + """ + The dataframe contains columns that are not features, such as 'winner', 'speedup' that are only used for + debugging purposes. This function returns the columns that are actually features. + """ + exclude_columns = [ + "speedup", + "winner", + "target", + "avail_choices", + "choice2time", + "index", + "actual_winner", + "relative_performance", + ] + feature_columns = [col for col in df.columns if col not in exclude_columns] + return feature_columns + + def add_training_data(self, df_train, datasets): + return datasets["train"] + + def main( + self, + log_path, + other_datasets, + nrows, + heuristic_name, + save_dot=False, + ranking=False, + ): + """ + Main function that trains a decision tree and generates a heuristic. + """ + # TODO: Enable apply_filters + (df, choices, cat_feature2cats, dummy_col_2_col_val, metadata) = self.get_df( + log_path, nrows=nrows, apply_filters=False, add_near_best=ranking + ) + self.dummy_col_2_col_val = dummy_col_2_col_val + datasets = self.prepare_datasets(df, other_datasets, cat_feature2cats, ranking) + df_train = self.add_training_data(datasets["train"], datasets) + datasets["train"] = df_train + print(datasets["train"]["winner"].value_counts().to_string()) + + feature_columns = self.get_feature_columns(df) + grid_search_values = self.get_grid_search_values() + max_depths = grid_search_values["max_depth"] + min_samples_leafs = grid_search_values["min_samples_leaf"] + criterion_list = grid_search_values["criterion"] + ( + results_df, + best_model, + best_model_safe_proba, + unsafe_leaves, + ) = self.train_and_evaluate_models( + datasets, + max_depths, + min_samples_leafs, + criterion_list, + feature_columns, + ranking=ranking, + ) + + if ranking: + columns_to_keep = [ + "set", + "crit", + "max_depth", + "min_samples_leaf", + "total", + "top_k_correct", + "top_k_wrong", + "top_k_unsure", + "wrong_max_speedup_k", + "wrong_gmean_speedup_k", + ] + results_df = results_df[columns_to_keep] + # prints results for all models and datasets + print(results_df.to_string()) + + sort_metric = "top_k_correct" if ranking else "correct" + # prints results grouped by dataset + for set_name in results_df["set"].unique(): + dataset_results = results_df[results_df["set"] == set_name] + dataset_results = dataset_results.sort_values(by=sort_metric) + print(dataset_results.to_string() + "\n") + + if best_model is not None: + if save_dot: + self.export_to_dot(best_model, df, feature_columns) + self.codegen( + best_model, + metadata, + heuristic_name, + best_model_safe_proba, + dummy_col_2_col_val, + unsafe_leaves, + ) + else: + print( + "All learned models have too many wrong predictions, so no heuristic was generated" + ) + + def get_df( + self, + log_path, + cat_feature2cats=None, + nrows=None, + 
apply_filters=False, + add_near_best=False, + ): + """ + Parses the log file and processes the data into a dataframe that can be used for training. + """ + (df, metadata, features, categorical_features, choices) = self.parse_log( + log_path, nrows + ) + + def calculate_stats(group): + count = len(group) + has_inf = np.isinf(group["feedback"]).any() + if has_inf: + relative_std = np.inf + median = np.inf + else: + mean = group["feedback"].mean() + std = group["feedback"].std() + relative_std = (std / mean) * 100 if mean != 0 else np.inf + median = group["feedback"].median() + if relative_std > 5: + times = group["feedback"].tolist() + times_str = ", ".join([f"{t:.3f}" for t in sorted(times)]) + log.debug("High relative std: %f. times=%s", relative_std, times_str) + return pd.Series( + { + "count": count, + "relative_std": relative_std, + "median_execution_time": median, + } + ) + + feature_columns = features + stats = ( + df.groupby(feature_columns + ["choice"], as_index=False) + .apply(calculate_stats, include_groups=False) + .reset_index() + ) + + # TODO: We have to be careful with removing certain choices, because if we e.g. remove the winner, the + # heuristic will end up learning wrong things. But, execution times with high variance are also bad + if apply_filters: + # Filter out inputs with less than 3 measurements or high relative std + valid_stats = stats[(stats["count"] >= 3) & (stats["relative_std"] <= 5)] + # Group by input features and count how many valid choices we have for each input + valid_inputs = valid_stats.groupby(feature_columns).filter( + lambda x: len(x) >= 2 + ) + else: + valid_inputs = stats + + # Compute the winner and speedup for each valid input + def get_winner_and_speedup(group): + assert len(group) >= 2, "Need at least 2 choices" + + sorted_group = group.sort_values("median_execution_time") + winner = sorted_group.iloc[0]["choice"] + winning_time = sorted_group.iloc[0]["median_execution_time"] + second_best_time = sorted_group.iloc[1]["median_execution_time"] + speedup = second_best_time / winning_time + unique_choices = group["choice"].unique() + + choice2time = {} + for row in group.itertuples(): + choice2time[row.choice] = row.median_execution_time + + assert len(unique_choices) == len( + group + ), f"len(unique_choices) != len(group): {len(unique_choices)} != {len(group)}" + + return pd.Series( + { + "winner": winner, + "speedup": speedup, + "avail_choices": unique_choices, + "choice2time": json.dumps(choice2time), + } + ) + + results = ( + valid_inputs.groupby(feature_columns, as_index=False) + .filter(lambda x: len(x) >= 2) + .groupby(feature_columns, as_index=False) + .apply(get_winner_and_speedup, include_groups=False) + .reset_index() + ) + + def add_near_best_configs(df): + new_rows = [] + + for index, row in df.iterrows(): + dictionary = json.loads(row["choice2time"]) + min_value = min(dictionary.values()) + + for key, value in dictionary.items(): + new_row = row.copy() + relative_performance = min_value / value + new_row["relative_performance"] = relative_performance + if relative_performance is None or relative_performance is np.inf: + breakpoint() + new_row["actual_winner"] = row["winner"] + new_row["winner"] = key + if relative_performance >= 0.98: + new_rows.append(new_row) + + return pd.DataFrame(new_rows).reset_index(drop=True) + + if add_near_best: + results = add_near_best_configs(results) + (results, added_categorical_features) = self.add_new_features(results) + categorical_features += added_categorical_features + + ( + results, + 
cat_feature2cats, + dummy_col_2_col_val, + ) = self.handle_categorical_features( + cat_feature2cats, categorical_features, results + ) + return (results, choices, cat_feature2cats, dummy_col_2_col_val, metadata) + + def ranking_always_included_choices(self): + return [] + + def gen_classes(self, classes, num_spaces): + """ + If classes=['choice1', 'choice2', 'choice3'], then this function returns + the following string: + self.choices.append('choice1') + self.choices.append('choice2') + self.choices.append('choice3') + Used in the generated heuristic to map the index of a choice to its name. + """ + indent = " " * num_spaces + return "\n".join([f"{indent}self.choices.append('{c}')" for c in classes]) + + def get_default_config(self, row): + """ + Returns the default config for a given sample. The default config could for example be the config that is + the chosen by a current handwritten heuristic. This can for example be used in get_unsafe_leaf to + compare the predicted config with the default config. + """ + return None + + def gen_predict_fn_def(self): + """ + Generates the definition of the predict function. + """ + return "def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:" + + def codegen_boilerplate( + self, heuristic_name, opt_name, threshold, shared_memory, device_capa, classes + ): + """ + Generates the boilerplate code for the generated heuristic. This includes things like imports, class definition, + etc. + """ + + boiler_plate = f"""# flake8: noqa: B950 +# fmt: off +# This file was generated by AutoHeuristic. Do not modify it manually! +# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/{opt_name}/ +from typing import List, Optional, Tuple + +from torch._inductor.autoheuristic.autoheuristic_utils import ( + AHContext, + AHMetadata, + Choice, +) +from torch._inductor.autoheuristic.learnedheuristic_interface import ( + LearnedHeuristicDecision, +) + + +class {heuristic_name}(LearnedHeuristicDecision): + + def __init__(self) -> None: + self.choices: List[Choice] = [] + self.fill_choices() + +{self.gen_precondition(opt_name, shared_memory, device_capa)} + + def get_confidence_threshold(self) -> float: + return {threshold} + + def get_choice(self, idx: int) -> Optional[str]: + if idx < len(self.choices): + return self.choices[idx] + return None + + def fill_choices(self) -> None: +{self.gen_classes(classes, num_spaces=8)} + + def get_name(self) -> str: + return '{opt_name}'""" + return boiler_plate + + def add_real_datasets( + self, datasets, other_datasets, cat_feature2cats, ranking=False + ): + """ + Adds datasets specified by the user to the datasets dictionary. 
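+        other_datasets is the list of (name, path) pairs passed via --data NAME PATH; each file is parsed
+        with get_df() and stored under datasets[name].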
+ """ + if other_datasets: + for name, path in other_datasets: + (df_other, choices, _, _, _) = self.get_df( + path, + cat_feature2cats=cat_feature2cats, + apply_filters=False, + add_near_best=ranking, + ) + datasets[name] = df_other + + def codegen( + self, + tree, + metadata, + heuristic_name, + threshold, + dummy_col_2_col_val, + unsafe_leaves, + ): + lines = [] + device_capa = metadata["device_capa"] + device_capa_str = f"({device_capa[0]}, {device_capa[1]})" + opt_name = metadata["name"] + lines.append( + self.codegen_boilerplate( + heuristic_name, + opt_name, + threshold, + metadata["shared_memory"], + device_capa_str, + tree.classes_, + ) + ) + fn_def = f"\n {self.gen_predict_fn_def()}" + lines.append(fn_def) + tree.codegen(dummy_col_2_col_val, lines, unsafe_leaves) + self.write_heuristic_to_file(lines, heuristic_name) + + +@dataclass +class AccuracyMetrics: + # Number of correct predictions + num_correct: int + # Number of wrong predictions + num_wrong: int + # Number of predictions where model is unsure + num_unsure: int + # Total number of predictions + total: int + + def to_map(self): + return { + "correct": self.num_correct, + "wrong": self.num_wrong, + "unsure": self.num_unsure, + "total": self.total, + } + + +@dataclass +class WrongSpeedupMetrics: + # If the model predicted the wrong choice, this is the maximum speedup of the best choice over the predicted choice + max_speedup: float + # For all wrong predictions, this is the geometric mean of the speedups of the best choices over the predicted choices + gmean_speedup: float + + def to_map(self): + return { + "wrong_max_speedup": self.max_speedup, + "wrong_gmean_speedup": self.gmean_speedup, + } + + +@dataclass +class RankingMetrics: + # Number of predictions where best choice is in top k choices + num_correct: int + # Number of predictions where best choice is not in top k choices + num_wrong: int + # Maximum speedup of best choice over best choice in top k (this tells us how much better the best choice, which + # is not in top k, is over the best choice in top k) + max_speedup: float + # Geometric mean of speedups of best choice over best choice in top k + gmean_speedup: float + # Number of predictions where model is unsure + unsure: int + + def to_map(self): + return { + "top_k_correct": self.num_correct, + "top_k_wrong": self.num_wrong, + "wrong_max_speedup_k": self.max_speedup, + "wrong_gmean_speedup_k": self.gmean_speedup, + "top_k_unsure": self.unsure, + } + + +@dataclass +class DefaultComparisonMetrics: + # Maximum speedup of predicted choice over default choice + max_speedup: float + # Geometric mean of speedups of predicted choices over default choices + gmean_speedup: float + # Maximum speedup of default choice over predicted choice + max_slowdown: float + # Number of predictions where the predicted choice is not the default choice + non_default_predictions: int + # Number of predictions where the default choice is better than the predicted choice + default_better: bool + + def to_map(self): + return { + "max_speedup_over_default": self.max_speedup, + "gmean_speedup_over_default": self.gmean_speedup, + "max_speedup_default_over_heuristic": self.max_slowdown, + "non_default_predictions": self.non_default_predictions, + "default_better": self.default_better, + } + + +@dataclass +class EvalResults: + accuracy: AccuracyMetrics + speedup: WrongSpeedupMetrics + ranking: RankingMetrics + default_comparison: DefaultComparisonMetrics + + def to_map(self): + return { + **self.accuracy.to_map(), + **self.speedup.to_map(), + 
**self.ranking.to_map(), + **self.default_comparison.to_map(), + } + + +class DecisionEvaluator: + def __init__( + self, + train, + model, + predictions, + df, + probas, + wrong_pct=0.01, + threshold=0.0, + k=10, + unsafe_leaves=None, + leaf_ids=None, + ranking=False, + ) -> None: + self.train = train + self.model = model + self.predictions = predictions + self.df = df + self.probas = probas + self.wrong_pct = wrong_pct + self.threshold = threshold + self.k = k + self.unsafe_leaves = unsafe_leaves + self.leaf_ids = leaf_ids + self.ranking = ranking + + self.num_correct = 0 + self.num_wrong = 0 + self.num_unsure = 0 + self.wrong_probas = [] + self.speedups_wrong = [] + self.num_correct_top_k = 0 + self.num_wrong_top_k = 0 + self.wrong_speedups_top_k = [] + self.top_k_unsure = 0 + self.num_non_default_predictions = 0 + self.speedups_over_default = [] + self.num_default_better = 0 + + def compute_speedup_over_default(self, default_config, pred, i, predicted_time): + if default_config is not None: + if pred != default_config: + self.num_non_default_predictions += 1 + default_time = self.get_time(self.df.iloc[i], default_config) + # TODO: We should keep track of how often this happens + if default_time is not None and not math.isinf(default_time): + speedup_over_default = default_time / predicted_time + if speedup_over_default < 1: + self.num_default_better += 1 + self.speedups_over_default.append(speedup_over_default) + else: + log.debug( + "cannot compute speedup over default because default_time=%d", + default_time, + ) + + def get_time(self, row, choice): + choices_feedback = json.loads(row["choice2time"]) + return choices_feedback.get(choice, None) + + def top_k_classes(self, model, probas, k, avail_choices): + # Get classes and their corresponding probabilities + classes = model.classes_ + class_proba_pairs = list(zip(classes, probas)) + + # Sort by probability (descending) and filter out zero probabilities + sorted_classes = [ + c + for c, p in sorted(zip(classes, probas), key=lambda x: x[1], reverse=True) + if p > 0 and c in avail_choices + ] + + # Return top k choices + top_k_choices = sorted_classes[:k] + top_k_choices += self.train.ranking_always_included_choices() + top_k_choices = list(dict.fromkeys(top_k_choices)) + return top_k_choices + + def eval_prediction( + self, avail_choices, leaf_id, pred, true, prob, threshold, default_config, i + ): + predicted_time = self.get_time(self.df.iloc[i], pred) + max_prob = max(prob) + if ( + leaf_id in self.unsafe_leaves + or pred not in avail_choices + or (max_prob != 1.0 and max_prob <= threshold) + ): + self.num_unsure += 1 + self.speedups_over_default.append(1.0) + elif pred == true: + self.compute_speedup_over_default(default_config, pred, i, predicted_time) + self.num_correct += 1 + else: + self.compute_speedup_over_default(default_config, pred, i, predicted_time) + self.num_wrong += 1 + self.wrong_probas.append(max_prob) + best_time = self.get_time(self.df.iloc[i], true) + wrong_speedup = predicted_time / best_time + self.speedups_wrong.append(wrong_speedup) + + def eval_ranking_prediction(self, true, top_k_choices, i): + if true in top_k_choices: + self.num_correct_top_k += 1 + else: + top_k_choices_times = [] + for choice in top_k_choices: + time = self.get_time(self.df.iloc[i], choice) + if time is not None: + top_k_choices_times.append(time) + best_time = self.get_time(self.df.iloc[i], true) + min_time = min(top_k_choices_times, default=None) + if min_time is not None: + speedup = min_time / best_time + 
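+                    # min_time is the fastest time among the predicted top-k choices and best_time the time of
+                    # the true best choice, so speedup >= 1 measures how much faster the missed best choice is.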
self.wrong_speedups_top_k.append(speedup) + self.num_wrong_top_k += 1 + else: + self.top_k_unsure += 1 + # TODO (AlnisM): print more info (input and choices) + log.debug( + "All top k choices have no time which means all top k are unavailable" + ) + + def get_safe_proba(self): + return self.get_results(return_safe_proba=True) + + def compute_safe_proba(self, num_predictions, wrong_probas, wrong_pct): + wrong_probas.sort() + num_wrong = len(wrong_probas) + allowed_wrong = int(num_predictions * wrong_pct) + if allowed_wrong >= num_wrong: + return 0.0 + too_many_wrong = num_wrong - allowed_wrong + idx = min(too_many_wrong, len(wrong_probas) - 1) + return wrong_probas[idx] + + def get_results(self, return_safe_proba=False) -> EvalResults: + """ + Custom evaluation function that evaluates a learned decision tree. + """ + + y_true = self.df["actual_winner"] if self.ranking else self.df["winner"] + i = 0 + for pred, true, prob, leaf_id in zip( + self.predictions, y_true, self.probas, self.leaf_ids + ): + avail_choices = self.df["avail_choices"].iloc[i] + top_k_choices = self.top_k_classes( + self.model, prob, k=self.k, avail_choices=avail_choices + ) + assert ( + true in avail_choices + ), f"Best choice {true} not in available choices {avail_choices}" + default_config = self.train.get_default_config(self.df.iloc[i]) + self.eval_prediction( + avail_choices, + leaf_id, + pred, + true, + prob, + self.threshold, + default_config, + i, + ) + self.eval_ranking_prediction(true, top_k_choices, i) + i += 1 + + total = len(self.predictions) + if return_safe_proba: + return self.compute_safe_proba(total, self.wrong_probas, self.wrong_pct) + + def safe_gmean(x): + return gmean(x) if x else 0 + + max_speedup = max(self.speedups_wrong, default=0) + gmean_speedup = safe_gmean(self.speedups_wrong) + max_speedup_top_k = max(self.wrong_speedups_top_k, default=0) + gmean_speedup_top_k = safe_gmean(self.wrong_speedups_top_k) + max_speedup_over_default = max(self.speedups_over_default, default=0) + gmean_speedup_over_default = safe_gmean(self.speedups_over_default) + max_slowdown_over_default = min(self.speedups_over_default, default=0) + + accuracyMetrics = AccuracyMetrics( + self.num_correct, self.num_wrong, self.num_unsure, total + ) + wrongSpeedupMetrics = WrongSpeedupMetrics(max_speedup, gmean_speedup) + rankingMetrics = RankingMetrics( + self.num_correct_top_k, + self.num_wrong_top_k, + max_speedup_top_k, + gmean_speedup_top_k, + self.top_k_unsure, + ) + defaultComparisonMetrics = DefaultComparisonMetrics( + max_speedup_over_default, + gmean_speedup_over_default, + max_slowdown_over_default, + self.num_non_default_predictions, + self.num_default_better, + ) + return EvalResults( + accuracyMetrics, + wrongSpeedupMetrics, + rankingMetrics, + defaultComparisonMetrics, + ) + + +if __name__ == "__main__": + train = AHTrainDecisionTree() + train.generate_heuristic() diff --git a/torchgen/_autoheuristic/train_regression.py b/torchgen/_autoheuristic/train_regression.py new file mode 100644 index 00000000000..1fc48732042 --- /dev/null +++ b/torchgen/_autoheuristic/train_regression.py @@ -0,0 +1,476 @@ +# mypy: ignore-errors + +import warnings + +import numpy as np +import pandas as pd # type: ignore[import-untyped] +from scipy.stats import gmean # type: ignore[import-untyped] +from sklearn.model_selection import train_test_split # type: ignore[import-untyped] +from sklearn.tree import DecisionTreeRegressor # type: ignore[import-untyped] +from train import AHTrain + +from torch._inductor.autoheuristic.autoheuristic_utils 
import CHOICE_COL, FEEDBACK_COL + + +# TODO (AlnisM): Fix these warnings +warnings.filterwarnings( + "ignore", + message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", +) +warnings.filterwarnings( + "ignore", + message="DataFrameGroupBy.apply operated on the grouping columns.", +) + + +class AHTrainRegressionTree(AHTrain): + """ + This class is responsible for generating a heuristic by using data collected with AutoHeuristic. It will learn a + regression tree that predicts a score that represents how well a specific choice will perform given an input. + A higher score means a better choice. The heuristic will be generated in a file named .py in the + torch/_inductor/autoheuristic/artifacts/ directory. + """ + + def __init__(self): + super().__init__() + + def main( + self, + log_path, + other_datasets, + nrows, + heuristic_name, + save_dot=False, + ranking=False, + ): + """ + Main function that trains a decision tree and generates a heuristic. + """ + (df, choices, cat_feature2cats, dummy_col_2_col_val, metadata) = self.get_df( + log_path, nrows=nrows, apply_filters=True + ) + df_train, df_val, df_test, feature_columns = self.custom_train_test_split(df) + datasets = {"train": df_train, "val": df_val, "test": df_test} + self.add_real_datasets(datasets, other_datasets, cat_feature2cats) + + # We will do a grid search over these values + # Only trying out max_depths of 5, 6, and 7 because we want to keep the tree and + # generated code small, but smaller than 5 does not perform well enough + max_depths = [5, 6, 7] + min_samples_leafs = [1, 2, 5, 10] + choice_columns = [f"{CHOICE_COL}_{choice}" for choice in choices] + (results_df, best_model, threshold) = self.train_and_evaluate_models( + datasets, feature_columns, choice_columns, max_depths, min_samples_leafs + ) + + # prints results for all models and datasets + print(results_df.to_string()) + + # prints results grouped by dataset + for set_name in results_df["dataset"].unique(): + dataset_results = results_df[results_df["dataset"] == set_name] + dataset_results = dataset_results.sort_values(by="correct") + print(dataset_results.to_string() + "\n") + + feature_names = feature_columns + choice_columns + self.dt_to_python( + best_model, + metadata, + feature_names, + dummy_col_2_col_val, + heuristic_name, + threshold, + ) + + def get_df(self, log_path, cat_feature2cats=None, nrows=None, apply_filters=False): + """ + Parses the log file and processes the data into a dataframe that can be used for training. 
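+        For every (input, choice) combination the resulting dataframe contains a 'winner' column (the choice
+        with the lowest median execution time) and a 'target' column (the mean of the choices' median times
+        divided by this choice's median time, so higher is better), which is what the regression tree predicts.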
+ """ + (df, metadata, feature_columns, categorical_features, choices) = self.parse_log( + log_path, nrows + ) + + def process_data( + df, + feature_columns, + apply_filters, + min_count_measurements=3, + max_relative_std=5, + ): + # Calculate statistics for each input and choice combination + def calculate_stats(group): + count = len(group) + mean = group[FEEDBACK_COL].mean() + std = group[FEEDBACK_COL].std() + relative_std = (std / mean) * 100 if mean != 0 else np.inf + median = group[FEEDBACK_COL].median() + return pd.Series( + { + "count": count, + "median_execution_time": median, + "relative_std": relative_std, + } + ) + + stats = ( + df.groupby(feature_columns + [CHOICE_COL]) + .apply(calculate_stats) + .reset_index() + ) + + if apply_filters: + # Remove unstables measurements + valid_stats = stats[ + (stats["count"] >= min_count_measurements) + & (stats["relative_std"] <= max_relative_std) + ] + # Keep only inputs with at least two valid choices + valid_inputs = valid_stats.groupby(feature_columns).filter( + lambda x: len(x) >= 2 + ) + else: + valid_inputs = stats + + # Compute the winner and ratios for each input + def get_winner_and_speedups(group): + mean_time = group["median_execution_time"].mean() + winner = group.loc[group["median_execution_time"].idxmin(), CHOICE_COL] + min_time = group["median_execution_time"].min() + max_time = group["median_execution_time"].max() + + group["winner"] = winner + group["speedup"] = max_time / min_time + group["target"] = mean_time / group["median_execution_time"] + + return group[ + feature_columns + [CHOICE_COL, "winner", "speedup", "target"] + ] + + results = ( + valid_inputs.groupby(feature_columns) + .apply(get_winner_and_speedups) + .reset_index(drop=True) + ) + + return results + + results = process_data(df, feature_columns, apply_filters) + (results, added_categorical_features) = self.add_new_features(results) + categorical_features += added_categorical_features + categorical_features += [CHOICE_COL] + + ( + results, + cat_feature2cats, + dummy_col_2_col_val, + ) = self.handle_categorical_features( + cat_feature2cats, categorical_features, results + ) + return (results, choices, cat_feature2cats, dummy_col_2_col_val, metadata) + + def custom_train_test_split( + self, df, test_size=0.2, val_size=0.25, random_state=42 + ): + """ + Splits the dataframe into train, val, and test sets. + Also adds other datasets, specified by the user, to the train set. + We need to be careful, because we want to make sure that rows with the same input but different choice are + kept in the same set, e.g. + Rows that looks like this + input_1,choice1,... + input_1,choice2,... + should be in the same set. 
+ """ + # We want to make sure that rows with the same input but different choice are kept in the same set + exclude_columns = ["speedup", "winner", "target"] + feature_columns = [ + col + for col in df.columns + if col not in exclude_columns and not col.startswith(CHOICE_COL + "_") + ] + df["input_id"] = df.groupby(feature_columns).ngroup() + + # Get unique input IDs + unique_inputs = df["input_id"].unique() + + # Split unique inputs into train+val and test + train_val_inputs, test_inputs = train_test_split( + unique_inputs, test_size=test_size, random_state=random_state + ) + + # Split train+val inputs into train and val + train_inputs, val_inputs = train_test_split( + train_val_inputs, test_size=val_size, random_state=random_state + ) + + # Create masks for each set + train_mask = df["input_id"].isin(train_inputs) + val_mask = df["input_id"].isin(val_inputs) + test_mask = df["input_id"].isin(test_inputs) + + # Split the dataframe + df_train = df[train_mask] + df_val = df[val_mask] + df_test = df[test_mask] + + # Remove the temporary input_id column + df_train = df_train.drop("input_id", axis=1) + df_val = df_val.drop("input_id", axis=1) + df_test = df_test.drop("input_id", axis=1) + + return df_train, df_val, df_test, feature_columns + + def train_and_evaluate_models( + self, + datasets, + feature_columns, + choice_columns, + max_depths, + min_samples_leafs, + threshold=0.99, + ): + """ + Does a grid search over max_depths, min_samples_leafs, and returns the best model. + """ + + results = [] + df_train = datasets["train"] + df_val = datasets["val"] + + best_model = None + best_model_threshold = 0 + max_correct_predictions = -1 + for max_depth in max_depths: + for min_samples_leaf in min_samples_leafs: + print( + f"Evaluating max_depth={max_depth}, min_samples_leaf={min_samples_leaf}" + ) + model = DecisionTreeRegressor( + random_state=42, + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + ) + model.fit( + df_train[feature_columns + choice_columns], df_train["target"] + ) + + # we first compute a safe threshold: this threshold ensures that on the validation set, + # if the heuristic returns a choice, the choice will be correct, although a high threshold + # can lead to a lot of 'unsure' choices + eval_result = self.evaluate_model( + model, df_val, feature_columns, choice_columns, threshold + ) + safe_threshold = eval_result["wrong_max_ratio"] + for dataset_name, dataset in datasets.items(): + eval_result = self.evaluate_model( + model, dataset, feature_columns, choice_columns, safe_threshold + ) + print(eval_result) + if dataset_name == "val": + eval_correct = eval_result["correct"] + if eval_correct > max_correct_predictions: + best_model = model + best_model_threshold = safe_threshold + max_correct_predictions = eval_correct + results.append( + { + "max_depth": max_depth, + "min_samples_leaf": min_samples_leaf, + "dataset": dataset_name, + "correct": eval_result["correct"], + "wrong": eval_result["wrong"], + "unsure": eval_result["unsure"], + "total": eval_result["total"], + "max_wrong_speedup": eval_result["max_wrong_speedup"], + "gman_wrong_speedup": eval_result["gman_wrong_speedup"], + "threshold": safe_threshold, + } + ) + + return (pd.DataFrame(results), best_model, best_model_threshold) + + def evaluate_model(self, model, df, feature_columns, choice_columns, threshold): + """ + Custom evaluation function that evaluates a learned decision tree. 
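+        For every input, the model predicts a score for each choice and the choice with the
+        highest score is selected. If the ratio between the best and the second-best
+        prediction does not exceed `threshold`, the prediction counts as "unsure". The
+        returned dictionary contains the number of correct, wrong, and unsure predictions,
+        speedup statistics for the wrong predictions, and the maximum ratio observed among
+        wrong predictions ("wrong_max_ratio"), which is later used as a safe confidence
+        threshold.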
+ """ + + def predict_winner(group): + predictions = model.predict(group[feature_columns + choice_columns]) + + # Find the index of the maximum prediction (best choice) + best_choice_index = np.argmax(predictions) + + # Get the corresponding choice + predicted_choice = ( + group[choice_columns].iloc[best_choice_index].idxmax().split("_")[-1] + ) + + # Calculate the ratio between the best and second-best prediction + sorted_predictions = np.sort(predictions)[::-1] + top_pred_ratio = ( + sorted_predictions[0] / sorted_predictions[1] + if len(sorted_predictions) > 1 + else np.inf + ) + + # If the best choice is not "significantly" better than the second best choice, + # the learned heuristic will return "unsure" + if top_pred_ratio <= threshold: + predicted_winner = "unsure" + else: + predicted_winner = predicted_choice + + actual_winner = group["winner"].iloc[0] + is_correct = ( + predicted_winner == actual_winner + if predicted_winner != "unsure" + else "unsure" + ) + + return pd.Series( + { + "predicted_winner": predicted_winner, + "ratio": top_pred_ratio, + "actual_winner": actual_winner, + "is_correct": is_correct, + "speedup": group["speedup"].iloc[ + 0 + ], # Speedup is the same for all rows in the group + } + ) + + results = df.groupby(feature_columns).apply(predict_winner).reset_index() + correct = (results["is_correct"].eq(True)).sum() + unsure = (results["is_correct"] == "unsure").sum() + wrong_results = results[results["is_correct"].eq(False)] + wrong = len(wrong_results) + + # Calculate max and geometric mean of speedup for wrong predictions + # Used for debugging purposes + wrong_speedups = wrong_results["speedup"] + max_wrong_speedup = wrong_speedups.max() if not wrong_speedups.empty else np.nan + geo_mean_wrong_speedup = ( + gmean(wrong_speedups) if not wrong_speedups.empty else np.nan + ) + wrong_max_ratio = wrong_results["ratio"].max() + + total = correct + wrong + unsure + return { + "correct": correct, + "wrong": wrong, + "unsure": unsure, + "total": total, + "max_wrong_speedup": max_wrong_speedup, + "gman_wrong_speedup": geo_mean_wrong_speedup, + "wrong_max_ratio": wrong_max_ratio, + } + + def dt_to_python( + self, + dt, + metadata, + feature_names, + dummy_col_2_col_val, + heuristic_name, + threshold, + unsafe_leaves=None, + ): + tree_ = dt.tree_ + feature_name = [ + feature_names[i] if i != -1 else "undefined!" 
for i in tree_.feature + ] + + lines = [] + device_capa = metadata["device_capa"] + device_capa_str = f"({device_capa[0]}, {device_capa[1]})" + opt_name = metadata["name"] + lines.append( + self.codegen_boilerplate( + heuristic_name, + opt_name, + threshold, + metadata["shared_memory"], + device_capa_str, + dt, + ) + ) + fn_def = f"\n {self.gen_predict_fn_def()}" + lines.append(fn_def) + + def dt_to_python(node, depth): + indent = " " * (depth + 1) + false_predicate = "" + if tree_.feature[node] != -2: + name = feature_name[node] + threshold = tree_.threshold[node] + if name in dummy_col_2_col_val: + (orig_name, value) = dummy_col_2_col_val[name] + predicate = f"{indent}if str(context.get_value('{orig_name}')) != '{value}':" + assert ( + threshold == 0.5 + ), f"expected threshold to be 0.5 but is {threshold}" + else: + predicate = ( + f"{indent}if context.get_value('{name}') <= {threshold}:" + ) + lines.append(predicate) + dt_to_python(tree_.children_left[node], depth + 1) + lines.append(f"{indent}else:") + dt_to_python(tree_.children_right[node], depth + 1) + else: + lines.append(self.handle_leaf(tree_, node, indent, unsafe_leaves)) + + dt_to_python(0, 1) + + self.write_heuristic_to_file(lines, heuristic_name) + + def handle_leaf(self, tree_, node, indent, unsafe_leaves): + """ + Generates the code for a leaf node. This is just the value predicted by the regression tree. + """ + value = tree_.value[node][0][0] + return f"{indent}return {str(value)}" + + def gen_predict_fn_def(self): + return "def predict(self, context: AHContext) -> float:" + + def codegen_boilerplate( + self, heuristic_name, opt_name, threshold, shared_memory, device_capa, classes + ): + """ + Generates the boilerplate code for the generated heuristic. This includes things like imports, class definition, + etc. + """ + + boiler_plate = f"""# flake8: noqa: B950 +# fmt: off +# This file was generated by AutoHeuristic. Do not modify it manually! +# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/{opt_name}/ +from torch._inductor.autoheuristic.autoheuristic_utils import AHContext, AHMetadata, Choice, CHOICE_COL +from torch._inductor.autoheuristic.learnedheuristic_interface import ( + LearnedHeuristicRegression, +) + + +class {heuristic_name}(LearnedHeuristicRegression): + + def __init__(self) -> None: + pass + +{self.gen_precondition(opt_name, shared_memory, device_capa)} + + def get_feedback(self, context: AHContext, choice: Choice) -> float: + context.context_dict[CHOICE_COL] = choice + return self.predict(context) + + def get_confidence_threshold(self) -> float: + return {threshold} + + def get_name(self) -> str: + return '{opt_name}'""" + return boiler_plate + + +if __name__ == "__main__": + train = AHTrain() + train.generate_heuristic() diff --git a/torchgen/aoti/__init__.py b/torchgen/aoti/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/aoti/fallback_ops.py b/torchgen/aoti/fallback_ops.py new file mode 100644 index 00000000000..5c3890824b9 --- /dev/null +++ b/torchgen/aoti/fallback_ops.py @@ -0,0 +1,150 @@ +# Be extra careful when you edit this file, because it affects AOTInductor ABI compatbility. See +# https://github.com/pytorch/pytorch/blob/7e86a7c0155295539996e0cf422883571126073e/torchgen/gen.py#L2424-L2436 +# for details. +# +# The inductor_fallback_ops list is based on the fallback ops from torch/_inductor/lowering.py. 
+# Generally speaking, it is ok to add a new op to the list, but you need to run +# `python torchgen/gen.py --update-aoti-c-shim` in order to regenerate C shim header files. +# But it is NOT ok to remove an existing fallback op from the list, since that will break +# some existing AOTInductor-compiled models. +inductor_fallback_ops = { + "aten._adaptive_avg_pool2d_backward.default", + "aten._adaptive_avg_pool2d.default", + "aten._adaptive_avg_pool3d.default", + "aten._adaptive_avg_pool3d_backward.default", + "aten.adaptive_max_pool2d_backward.default", + "aten.adaptive_max_pool2d.default", + "aten.adaptive_max_pool3d.default", + "aten.adaptive_max_pool3d_backward.default", + "aten.addbmm.default", + "aten._addmm_activation.default", + "aten.addmm.out", + "aten.addmv.default", + "aten.angle.default", + "aten.avg_pool2d_backward.default", + "aten.avg_pool2d.default", + "aten.avg_pool3d_backward.default", + "aten.avg_pool3d.default", + "aten.bernoulli_.float", + "aten.bernoulli_.Tensor", + "aten.bmm.out", + "aten.bucketize.Tensor", + "aten.cat.default", + "aten._cdist_backward.default", + "aten._cdist_forward.default", + "aten.cholesky_inverse.default", + "aten.cholesky_solve.default", + "aten.convolution_backward.default", + "aten._cudnn_rnn.default", + "aten._cudnn_rnn_backward.default", + "aten.convolution.default", + "aten.cummax.default", + "aten.cummin.default", + "aten.cumprod.default", + "aten.cumsum.default", + "aten._efficient_attention_backward.default", + "aten._efficient_attention_forward.default", + "aten._efficientzerotensor.default", + "aten._embedding_bag.default", + "aten._embedding_bag_dense_backward.default", + "aten._embedding_bag_forward_only.default", + "aten._embedding_bag_per_sample_weights_backward.default", + "aten.exponential.default", + "aten._fft_c2c.default", + "aten._fft_r2c.default", + "aten._flash_attention_backward.default", + "aten._flash_attention_forward.default", + "aten.fractional_max_pool2d_backward.default", + "aten.fractional_max_pool2d.default", + "aten.fractional_max_pool3d.default", + "aten.fractional_max_pool3d_backward.default", + "aten._fused_moving_avg_obs_fq_helper.default", + "aten._fused_moving_avg_obs_fq_helper_functional.default", + "aten.gcd.default", + "aten.geqrf.default", + "aten.grid_sampler_2d_backward.default", + "aten.histc.default", + "aten.histogram.bin_ct", + "aten._histogramdd_bin_edges.default", + "aten._histogramdd_from_bin_cts.default", + "aten.index_put.default", + "aten.index_reduce.default", + "aten.index.Tensor", + "aten.kthvalue.default", + "aten.logcumsumexp.default", + "aten.lu_unpack.default", + "aten.masked_scatter.default", + "aten.masked_scatter_backward.default", + "aten.max_pool2d_with_indices_backward.default", + "aten.max_pool2d_with_indices.default", + "aten.max_pool3d_with_indices.default", + "aten.max_pool3d_with_indices_backward.default", + "aten.max_unpool2d.default", + "aten.max_unpool3d.default", + "aten.median.default", + "aten.mm.out", + "aten.mode.default", + "aten.mul.Scalar", + "aten.mul.Tensor", + "aten.nanmedian.default", + "aten.native_dropout.default", + "aten.normal_functional.default", + "aten.nonzero.default", + "aten.ormqr.default", + "aten._pdist_backward.default", + "aten._pdist_forward.default", + "aten.polar.default", + "aten.pow.Scalar", + "aten.pow.Tensor_Scalar", + "aten.pow.Tensor_Tensor", + "aten.rand.default", + "aten.rand.generator", + "aten.randint.default", + "aten.randint.generator", + "aten.randint.low", + "aten.randint.low_out", + "aten.randn.default", + 
"aten.randn.generator", + "aten.randperm.default", + "aten.repeat_interleave.Tensor", + "aten.replication_pad1d_backward.default", + "aten.replication_pad2d_backward.default", + "aten.reshape.default", + "aten.resize_.default", + "aten.resize_as_.default", + "aten._scaled_dot_product_efficient_attention_backward.default", + "aten._scaled_dot_product_efficient_attention.default", + "aten._scaled_dot_product_flash_attention_backward.default", + "aten._scaled_dot_product_flash_attention.default", + "aten._scaled_dot_product_cudnn_attention_backward.default", + "aten._scaled_dot_product_cudnn_attention.default", + "aten._scaled_dot_product_flash_attention_for_cpu_backward.default", + "aten._scaled_dot_product_flash_attention_for_cpu.default", + "aten._scaled_mm.default", + "aten._scaled_mm.out", + "aten.scatter_reduce.two_out", + "aten.scatter.src_out", + "aten.scatter.value_out", + "aten.searchsorted.default", + "aten._segment_reduce_backward.default", + "aten.segment_reduce.default", + "aten.slice.Tensor", + "aten.soft_margin_loss_backward.default", + "aten.sort.default", + "aten.sort.stable", + "aten._sparse_coo_tensor_with_dims_and_tensors.default", + "aten._thnn_fused_lstm_cell.default", + "aten.topk.default", + "aten._to_sparse.default", + "aten.to_sparse.default", + "aten.triangular_solve.default", + "aten._trilinear.default", + "aten.uniform.default", + "aten.upsample_bicubic2d_backward.default", + "aten.upsample_linear1d_backward.default", + "aten.upsample_trilinear3d_backward.default", + "aten.view_as_complex.default", + "aten.view_as_real.default", + "aten.view.dtype", + "aten.zeros.names", +} diff --git a/torchgen/api/__init__.py b/torchgen/api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/api/autograd.py b/torchgen/api/autograd.py new file mode 100644 index 00000000000..644069395e1 --- /dev/null +++ b/torchgen/api/autograd.py @@ -0,0 +1,870 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import cast, Sequence + +from torchgen import local +from torchgen.api import cpp +from torchgen.api.types import BaseCType, Binding, NamedCType, tensorListT +from torchgen.model import ( + BaseTy, + BaseType, + FunctionSchema, + ListType, + NativeFunction, + NativeFunctionsViewGroup, + SchemaKind, + Type, +) +from torchgen.utils import IDENT_REGEX + + +# Represents a saved attribute involved in backward calculation. +# Note that it can be a derived property of an input argument, e.g.: +# we could save `other.scalar_type()` instead of the entire `other` tensor. +@dataclass(frozen=True) +class SavedAttribute: + # The NamedCType holds the updated name and cpp type of the attribute + # for the name, Suffix is appended if it's derived property, e.g.: `other_scalar_type` + nctype: NamedCType + + # The expression to read the derived property at save time, e.g.: + # `other.scalar_type()`. + expr: str + + +# Represents a backward formula that calculates derivatives for one +# or more tensors. +@dataclass(frozen=True) +class Derivative: + # The formula string (legit C++ expression). + # Note that expressions against input arguments have been replaced with the + # corresponding saved attributes. + # E.g.: + # raw formula: `mul_tensor_backward(grad, self, other.scalar_type())` + # here: `mul_tensor_backward(grad, self, other_scalar_type)` + formula: str + + # The formula string before input argument replacement + original_formula: str + + # Names of the arguments for which this formula calculates derivatives. 
+ var_names: tuple[str, ...] + + # Saved inputs that are referenced by the formula. + saved_inputs: tuple[SavedAttribute, ...] + + # Saved outputs that are referenced by the formula. + saved_outputs: tuple[SavedAttribute, ...] + + # Gradients that are referenced by name in the formula. + named_gradients: set[str] + + +# Represents a forward formula that calculates forward derivatives +# for one tensor. +@dataclass(frozen=True) +class ForwardDerivative: + # The formula string (legit C++ expression). + # Note that special keywords such as "linear" or "element_wise" have been + # replaced by the automatically generated formula. + formula: str + + # Name of the output arguments for which this formula calculates forward + # derivatives + var_names: tuple[str, ...] + + # Type of the output arguments for which this formula calculates forward + # derivatives + var_types: tuple[Type, ...] + + # Inputs for which the forward derivatives are required for this formula + required_inputs_fw_grad: tuple[str, ...] | None + + # Inputs for which the primal is required for this formula + required_inputs_primal: tuple[str, ...] | None + + # Flag to specify if this formula requires the original value of self + # This is only used by inplace operations + required_original_self_value: bool + + # If this formula is specified in derivatives.yaml or if we are re-using the + # out of place formula for inplace + is_reusing_outplace_formula: bool + + +# Represents differentiability info for a NativeFunction. +@dataclass(frozen=True) +class DifferentiabilityInfo: + # The base name read from derivatives.yaml. + name: str + + # The matching native function. + # + # There can be multiple NativeFunction having the same base name: + # - different overloads with different types of input arguments; + # - in-place/out/functional variants of the same function; + # + # We first use the schema string (under the 'name' key) in derivatives.yaml + # to find the NativeFunction having the same schema string. + # Then we find the in-place/out/functional variants of the matching function. + # Among these variants, we choose the one having the same name as the + # derivatives.yaml entry. If there is no exact match, then we choose the + # in-place variant. + # TODO: maybe the logic to search for all variants is no longer necessary? + func: NativeFunction + + # The name of the generated autograd function. + # It's set only if we will calculate a derivative, i.e. + # 'args_with_derivatives' is not empty. + op: str | None + + # The derivatives formulae for this function. + # Note that the length of this sequence is the number of differentiable inputs + derivatives: Sequence[Derivative] + + # The forward derivatives formulae for this function. + # Note that the length of this sequence is the number of differentiable outputs + forward_derivatives: Sequence[ForwardDerivative] + + # The union of 'saved_inputs' of all 'derivatives'. + all_saved_inputs: Sequence[SavedAttribute] + + # The union of 'saved_outputs' of all 'derivatives'. + all_saved_outputs: Sequence[SavedAttribute] + + # All named gradients that are available for use, in the same + # order as in the grads vector. + available_named_gradients: Sequence[str] + + # The named gradients that are used in any of the derivatives. + # Invariant: all(name in available_named_gradients for name in used_named_gradients) + used_named_gradients: set[str] + + # The function's input arguments for which it calculates derivatives. 
+ # It's the union of 'var_names' of all 'derivatives', sorted by the + # argument order in the function schema. + args_with_derivatives: Sequence[Binding] + + # Names of arguments whose derivative formula is 'non_differentiable'. + non_differentiable_arg_names: Sequence[str] + + # Raw data read from derivatives.yaml. + output_differentiability: list[bool] | None + + # output_differentiability in derivatives.yaml can be a list of + # conditions that express if the output is differentiable. In this case, + # the number of conditions must match the number of outputs + # (NB: we only support one condition right now). + # output_differentiability gets populated with True for each condition, + # while output_differentiability_conditions gets populated with the conditions + output_differentiability_conditions: list[str] | None + + @property + def has_derivatives(self) -> bool: + return len(self.args_with_derivatives) > 0 + + # Generates a new DifferentiabilityInfo using the exact same set of derivative information, + # but with a new operator name. + # This is used when generating "copy" variants of view ops, + # which are able to use the exact same derivative formula as the original view op + # See Note [Codegen'd {view}_copy Operators] + def create_view_copy_from_view_derivative( + self, g: NativeFunctionsViewGroup + ) -> DifferentiabilityInfo | None: + if g.view_copy is None: + return None + f = g.view_copy + + name_split_by_period = self.name.split(".", maxsplit=2) + # Append a "_copy" to the base name of the operator (but keep the overload name the same) + view_copy_name = f"{name_split_by_period[0]}_copy." + ".".join( + name_split_by_period[1:] + ) + view_copy_op_name = None if self.op is None else f"{self.op}_copy" + + return DifferentiabilityInfo( + # Use the "_copy" version of name/func/op + name=view_copy_name, + func=f, + op=view_copy_op_name, + # But keep all derivative info the same + derivatives=self.derivatives, + forward_derivatives=self.forward_derivatives, + all_saved_inputs=self.all_saved_inputs, + all_saved_outputs=self.all_saved_outputs, + available_named_gradients=self.available_named_gradients, + used_named_gradients=self.used_named_gradients, + args_with_derivatives=self.args_with_derivatives, + non_differentiable_arg_names=self.non_differentiable_arg_names, + output_differentiability=self.output_differentiability, + output_differentiability_conditions=self.output_differentiability_conditions, + ) + + +def uses_ident(info: DifferentiabilityInfo | None, ident: str) -> bool: + if info is None: + return False + for derivative in info.derivatives: + formula = derivative.formula + if re.search(IDENT_REGEX.format(ident), formula): + return True + return False + + +def uses_retain_variables(info: DifferentiabilityInfo | None) -> bool: + return uses_ident(info, "retain_variables") + + +def uses_single_grad(info: DifferentiabilityInfo | None) -> bool: + return uses_ident(info, "grad") + + +# Represents a differentiable `Argument`. +# How is it different from the `Argument` type? +# - It's processed Arguments which are differentiable and only used in the +# context of the autograd codegen; +# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; +@dataclass(frozen=True) +class DifferentiableInput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str + + +# Represents a differentiable `Return`. +# How it it different from the `Return` type? 
+# - The name in `Return` is optional. Here it is always populated using the same +# `cpp.return_names()` method. +# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? +# - It's processed Returns which are differentiable, in compliance with the +# `output_differentiability` field defined in derivatives.yaml (if specified), +# and are only used in the context of the autograd codegen; +@dataclass(frozen=True) +class DifferentiableOutput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str + + +@dataclass(frozen=True) +class NativeFunctionWithDifferentiabilityInfo: + func: NativeFunction + info: dict[str, DifferentiabilityInfo] | None + fw_derivatives: dict[str, Sequence[ForwardDerivative]] | None + + +# TODO: Update comment below since it is out of date. +def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: + """How are we going to call the underlying implementation of a + declaration? There are two strategies: + - use_derived: we want to call the implementation on CPUDoubleType + (or a similar, derived Type instance). Because these derived + instances deal in Tensors, not Variables (it's a completely different + object, so it doesn't dispatch back to VariableType), code on + this dispatch path needs to wrap/unwrap tensors. If the + derived implementation takes and returns tensors, the + implementation is usually differentiable (although we also use + the derived dispatch path for non-differentiable functions + that we still want to dispatch on the derived Type instance; + e.g., size()) + - use_type: we want to call the implementation on Type, because + it is implemented concretely, and the functions it invokes will + get dispatched back to VariableType (which will ensure that they + are differentiable.) + """ + # fn is derived as long as any of its per-key differentiability infos + # has_derivatives. dispatch_strategy() is used to guard generation of fns in VariableType + # and ADInplaceOrViewType. We want to generate these functions as long as a + # derivative is defined for ANY dispatch key. + if fn.func.is_abstract or ( + fn.info is not None and any(info.has_derivatives for info in fn.info.values()) + ): + # If the function is abstract (not implemented on at::Type), we must + # call the implementation on the derived type with unpacked tensors. + + # If the function has a derivative specified and is concrete, we could + # call either implementation. We prefer the calling the derived + # type's implementation with unpacked tensors because it is more + # performant in some cases: any internal calls to other ATen functions + # won't have the history tracked. + + # If the function has a type dispatched argument (i.e. is a factory), + # we prefer calling the derived type's implementation both because it is + # more performant and to ensure factory functions return tensors with _version + # of 0 (probably not strictly necessary, but nice to have to keeps versions simple + # to understand. + + return "use_derived" + else: + # If the function is concrete (we don't have to override it) and we + # didn't declare it in derivatives.yaml, we'll assume that it is + # actually implemented out of differentiable functions. (This + # assumption might not hold, but then you'll see gradcheck fail.) 
+ return "use_type" + + +def is_foreach_func(f: NativeFunction) -> bool: + return f.func.name.name.base.startswith("_foreach_") + + +# note(crcrpar): Most foreach functions can reference an out-place `torch` function whose schema kind +# is functional for their backward derivatives (and forward derivatives in the future), i.e., +# they would find such one in `functional_info_by_signature`. There however are some exceptions: +_foreach_with_inplace_ref = {"_foreach_zero_"} +_foreach_with_tensor_overload = { + "_foreach_add.Tensor", + "_foreach_mul.Tensor", + "_foreach_div.Tensor", +} +# The following do not support the alpha kwarg, which the nonforeach versions support. +_skip_argument_len_check = { + "_foreach_add.Scalar", + "_foreach_add_.Scalar", + "_foreach_add.ScalarList", + "_foreach_add_.ScalarList", + "_foreach_sub.Scalar", + "_foreach_sub_.Scalar", + "_foreach_sub.ScalarList", + "_foreach_sub_.ScalarList", +} + + +# Checks if `function_schema` is a native, non-foreach function which `f`, a foreach function +# reference to generate derivatives. +def is_reference_for_foreach( + f: NativeFunction, + function_schema: FunctionSchema, +) -> bool: + return ( + f.func.name.name.base.split("_foreach_")[-1] == function_schema.name.name.base + and ( + not function_schema.name.name.inplace + or str(f.func.name) in _foreach_with_inplace_ref + ) + and ( + str(f.func.name) in _skip_argument_len_check + or len(f.func.arguments.flat_non_out) + == len(function_schema.arguments.flat_non_out) + ) + and all( + ref_arg.type in (arg.type, getattr(arg.type, "elem", None)) + for arg, ref_arg in zip( + f.func.arguments.flat_non_out, + function_schema.arguments.flat_non_out, + ) + ) + ) + + +# TODO(crcrpar): Avoid hard coding "Default" ideally. +def gen_foreach_derivativeinfo( + foreach_function: NativeFunction, + functional_info_by_signature: dict[ + FunctionSchema, dict[str, DifferentiabilityInfo] + ], + non_functional_info_by_signature: dict[ + FunctionSchema, dict[str, DifferentiabilityInfo] + ], + dispatch_key: str = "Default", +) -> tuple[DifferentiabilityInfo | None, bool]: + """Generate DifferentiabilityInfo for out-place foreach function, return the existing one for in-place. + + The second return value indicates whether the info is generated in this function. + """ + ref_diff_info: DifferentiabilityInfo | None = None + + for function_schema, diff_info in functional_info_by_signature.items(): + if not is_reference_for_foreach(foreach_function, function_schema): + continue + ref_diff_info = diff_info[dispatch_key] + if ref_diff_info is not None: + break + # note(crcrpar): It seems like `zero`'s info isn't available in functional_info_by_signature + # while the info of `zero_` is in non_functional_info_by_signature + if ( + ref_diff_info is None + and foreach_function.func.kind() == SchemaKind.inplace + and str(foreach_function.func.name) in _foreach_with_inplace_ref + ): + for function_schema, diff_info in non_functional_info_by_signature.items(): + if not is_reference_for_foreach(foreach_function, function_schema): + continue + ref_diff_info = diff_info[dispatch_key] + if ref_diff_info is not None: + break + if ref_diff_info is None: + return None, False + + # non out-place uses the existing Derivative. 
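+    # (in-place foreach functions simply reuse the reference DifferentiabilityInfo as-is;
+    # only out-of-place foreach functions get a newly generated one below)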
+ if foreach_function.func.kind() == SchemaKind.inplace: + return ref_diff_info, False + + map_refarg2foreacharg, map_name2arg = {}, {} + for i, (arg, ref_arg) in enumerate( + zip( + foreach_function.func.arguments.flat_non_out, + function_schema.arguments.flat_non_out, + ) + ): + map_refarg2foreacharg[ref_arg.name] = arg.name + map_name2arg[arg.name] = arg + + all_saved_inputs, all_saved_outputs, all_var_names = [], [], [] + modified_derivative_formulas = [] + for i, derivative in enumerate(ref_diff_info.derivatives): + modified_formula = derivative.formula.replace("grad", "grads[i]").replace( + "result", "result[i]" + ) + saved_inputs, saved_outputs = [], [] + # note(crcrpar): This context seems necessary to call `cpp.argument_type` + with local.parametrize( + use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors, + use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group, + ): + for ref_input in derivative.saved_inputs: + ref_input_jit_name = ref_input.expr.split(".")[0] + mapped_name = map_refarg2foreacharg[ref_input_jit_name] + if isinstance(map_name2arg[mapped_name].type, ListType): + mapped_expr = mapped_name + "[i]" + else: + mapped_expr = mapped_name + new_expr = ref_input.expr.replace(ref_input_jit_name, mapped_expr) + modified_formula = modified_formula.replace( + cast(str, ref_input.nctype.name), new_expr + ) + + nctype = cpp.argument_type(map_name2arg[mapped_name], binds=mapped_name) + canonical_nctype = NamedCType( + nctype.name, nctype.type.remove_const_ref() + ) + saved_inputs.append( + SavedAttribute(nctype=canonical_nctype, expr=mapped_name) + ) + for ref_output in derivative.saved_outputs: + if ref_output.nctype.name == "result": + saved_outputs.append( + SavedAttribute( + nctype=NamedCType( + name="result", type=BaseCType(tensorListT) + ), + expr="result", + ) + ) + else: + raise RuntimeError("") + var_names = [map_refarg2foreacharg[var] for var in derivative.var_names] + all_var_names.extend(var_names) + all_saved_inputs.extend(saved_inputs) + all_saved_outputs.extend(saved_outputs) + modified_derivative = Derivative( + formula=modified_formula, + original_formula=derivative.formula, + var_names=tuple(var_names), + saved_inputs=tuple(saved_inputs), + saved_outputs=tuple(saved_outputs), + named_gradients=set(), + ) + modified_derivative_formulas.append(modified_derivative) + + with local.parametrize( + use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors, + use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group, + ): + args_with_derivatives = [ + Binding( + name=arg.name, + nctype=cpp.argument_type(arg, binds=arg.name), + argument=arg, + default=None, + ) + for arg in foreach_function.func.arguments.flat_non_out + if arg.name in all_var_names + ] + + forward_derivatives: list[ForwardDerivative] = [] + fw_derivative: ForwardDerivative + for fw_derivative in ref_diff_info.forward_derivatives: + var_names: list[str] = list(fw_derivative.var_names) # type: ignore[no-redef] + var_types: list[Type] = list(fw_derivative.var_types) + required_inputs_fw_grad: list[str] = [] + required_inputs_primal: list[str] = [] + if fw_derivative.required_inputs_fw_grad is not None: + required_inputs_fw_grad = list(fw_derivative.required_inputs_fw_grad) + if fw_derivative.required_inputs_primal: + required_inputs_primal = list(fw_derivative.required_inputs_primal) + modified_formula = fw_derivative.formula + + # Foreach's result is TensorList + if "result" in modified_formula: + modified_formula = 
fw_derivative.formula.replace("result", "result[i]") + + for foreach_arg, ref_arg in zip( + foreach_function.func.arguments.flat_non_out, + ref_diff_info.func.func.arguments.flat_non_out, + ): + # Modify reference forward formula + if ( + isinstance(foreach_arg.type, ListType) + and not foreach_arg.type.is_tensor_like() + ): + # Assuming ScalarList + modified_formula = modified_formula.replace( + ref_arg.name, foreach_arg.name + "[i]" + ) + elif foreach_arg.type.is_tensor_like(): + # Assuming TensorList / Tensor + # assert isinstance(foreach_arg.type, ListType), f"{foreach_function.func.name}, {foreach_arg.type}" + assert isinstance(foreach_arg.type, ListType) or ( + foreach_arg.type == BaseType(BaseTy.Tensor) + and str(foreach_function.func.name) in _foreach_with_tensor_overload + ), f"{foreach_function.func.name}, {foreach_arg.type}" + for suffix in ("_p", "_t"): + curr_expr = ref_arg.name + suffix + if curr_expr in modified_formula: + new_expr = foreach_arg.name + suffix + modified_formula = modified_formula.replace(curr_expr, new_expr) + else: + # Assuming Scalar + if foreach_arg.name != ref_arg.name: + modified_formula = modified_formula.replace( + ref_arg.name, foreach_arg.name + ) + + # note(crcrpar): there should exist a cooler way... + for i, name in enumerate(var_names): + if name == ref_arg.name: + var_names[i] = foreach_arg.name + var_types[i] = foreach_arg.type + for i, name in enumerate(required_inputs_fw_grad): + if name == ref_arg.name: + required_inputs_fw_grad[i] = foreach_arg.name + for i, name in enumerate(required_inputs_primal): + if name == ref_arg.name: + required_inputs_primal[i] = foreach_arg.name + forward_derivatives.append( + ForwardDerivative( + formula=modified_formula, + var_names=tuple(var_names), + var_types=tuple(var_types), + required_inputs_fw_grad=tuple(required_inputs_fw_grad), + required_inputs_primal=tuple(required_inputs_primal), + required_original_self_value=fw_derivative.required_original_self_value, + is_reusing_outplace_formula=fw_derivative.is_reusing_outplace_formula, + ) + ) + + return ( + DifferentiabilityInfo( + name=foreach_function.func.name.name.base, + func=foreach_function, + op=f"Foreach{ref_diff_info.op}{foreach_function.func.name.overload_name}", + derivatives=modified_derivative_formulas, + forward_derivatives=forward_derivatives, + all_saved_inputs=tuple(set(all_saved_inputs)), + all_saved_outputs=tuple(set(all_saved_outputs)), + available_named_gradients=(), + used_named_gradients=set(), + args_with_derivatives=args_with_derivatives, + non_differentiable_arg_names=[], + output_differentiability=None, + output_differentiability_conditions=None, + ), + True, + ) + + +def match_differentiability_info( + native_functions: list[NativeFunction], + differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]], +) -> list[NativeFunctionWithDifferentiabilityInfo]: + """Sets the "derivative" key on declarations to matching autograd function + In-place functions will use the out-of-place derivative definition if there + is no in-place specific derivative. 
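+    Matching is attempted in order: an exact schema match, the out-of-place (functional)
+    signature, the mutable variant's formula for code-generated out-of-place variants, and,
+    for foreach functions, automatically generated derivative info. Each native function is
+    returned as a NativeFunctionWithDifferentiabilityInfo, with its forward derivative
+    formulas already rewritten for the in-place case where necessary.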
+ """ + + functional_info_by_signature = { + schema.signature(strip_default=True): info_dict + for schema, info_dict in differentiability_infos.items() + if schema.kind() == SchemaKind.functional + } + non_functional_info_by_signature = { + schema.signature(strip_default=True): info_dict + for schema, info_dict in differentiability_infos.items() + if schema.kind() != SchemaKind.functional + } + + def find_info( + f: NativeFunction, + ) -> tuple[dict[str, DifferentiabilityInfo] | None, bool]: + # Don't bother matching info to generated out= variants + if "generated" in f.tags and f.func.kind() == SchemaKind.out: + return None, False + + # (1) Check for an exact match + if f.func in differentiability_infos: + return differentiability_infos[f.func], True + + # (2) If no exact match, check if the out-of-place variant + # of this operator has a match. + # i.e mul() for mul_() or mul_out() + # note(crcrpar): Check foreach or not because in-place foreach functions use backward defined for the existing + # native functions instead of the out-place counterparts. + f_sig = f.func.signature(strip_default=True) + if f_sig in functional_info_by_signature and not is_foreach_func(f): + return functional_info_by_signature[f_sig], False + + # (3) Some operators have a derivative explicitly defined for the mutable + # variant, but get a code-generated out-of-place variant which does *not* + # come with a derivative formula. + # For the generated out-of-place variant, use the mutable variant's formula + # if it exists. + if "generated" in f.tags and f_sig in non_functional_info_by_signature: + info_dict = non_functional_info_by_signature[f_sig] + # See https://github.com/pytorch/pytorch/pull/76320/files#r874816389 + assert not any( + any("self" in str(inpt.nctype.name) for inpt in info.all_saved_inputs) + for info in info_dict.values() + ), f"""\ +Attempted to convert a derivative formula for a mutable operator + to be used by automatically by its functional variant ("{str(f.func)}"). + this is not currently supported (we'd need to fix up the formula in the codegen).""" + return info_dict, False + + # (4) Generate derivative information of foreach functions if none is defined in `derivatives.yaml` + if is_foreach_func(f): + assert f.func not in differentiability_infos + diff_info, is_generated = gen_foreach_derivativeinfo( + f, + functional_info_by_signature, + non_functional_info_by_signature, + ) + if diff_info is None: + return None, False + # TODO(crcrpar): Avoid hard coding "Default" ideally. + diff_info_dict = {"Default": diff_info} + if is_generated: + differentiability_infos[f.func] = diff_info_dict + functional_info_by_signature[f.func] = diff_info_dict + return diff_info_dict, is_generated + + return None, False + + result: list[NativeFunctionWithDifferentiabilityInfo] = [] + for f in native_functions: + info_dict, is_exact_match = find_info(f) + + # Currently, the '.strides()' to 'strides_or_error' replacement does not support + # 'self' derivatives of an inplace function, so we must check for this case. 
+ if f.func.kind() == SchemaKind.inplace and (info_dict is not None): + for info in info_dict.values(): + for derivative in info.derivatives: + if "self" in derivative.var_names: + for saved_input in derivative.saved_inputs: + assert "strides_or_error" not in saved_input.expr, ( + "Calling '.strides()' in the 'self' derivative formula of an " + f"in-place function is not supported: {f.func}" + ) + + if not info_dict: + result.append( + NativeFunctionWithDifferentiabilityInfo( + func=f, info=None, fw_derivatives=None + ) + ) + continue + + fw_derivative_dict: dict[str, Sequence[ForwardDerivative]] = {} + for key, info in info_dict.items(): + if not info.forward_derivatives: + fw_derivative_dict[key] = [] + continue + + forward_derivatives = info.forward_derivatives + + # For functions that have a single def for out-of-place and inplace (like abs()) + if f.func.kind() == SchemaKind.inplace: + # For inplace functions there is a little bit of work to do: + # 1) Validate the formula and make sure the input that is modified in not used: + # - If there is a formula for the inplace variant of the function (is_exact_match == True) then + # we make sure that the original value of the input that is being modified inplace (self_p) is + # not used in the formula. Note that the formula can use "original_self_p" here and that would + # trigger a clone of the original input. + # - If we are re-using the out of place formula (is_exact_match == False) then we replace every + # occurrence of self_p and self_t by original_self_p and original_self_t. These will be + # populated by cloned version of the original input (either the clone done by the backward AD + # logic if self is also used in a backward formula or a special clone that we add). + # 2) At this point, there cannot be a self_p in the formula. + # 3) Change "result" into "self_p" as by design, in the inplace function codegen, the result is + # simply called self (as it is modified inplace). + # 4) Update the required primals data in case it used to contain "result" but should now contain + # "self" + # 5) If it is not an exact match, the user formula is not modifying the existing forward grad + # inplace as it should. So add some code that makes sure that we do so if the forward grad + # already exists. + + assert ( + len(info.forward_derivatives) == 1 + ) # Only single output inplace should exist + fw_info = info.forward_derivatives[0] + formula = fw_info.formula + + def replace_self_with_original_self(formula: str, postfix: str) -> str: + def repl(m: re.Match[str]) -> str: + return f"{m.group(1)}original_self{postfix}{m.group(2)}" + + return re.sub(IDENT_REGEX.format(f"self{postfix}"), repl, formula) + + if re.search(IDENT_REGEX.format("self_p"), formula): + if is_exact_match: + # For manually defined formulas, don't allow the original value to be used + raise RuntimeError( + f'The formula for "{f.func.name}" is using the original value of self ' + "that is being modified inplace. This would lead to wrong forward gradients. " + 'Please use "result" in the formula only.' 
+ ) + else: + # When the original formula is out of place, we save a clone of the primal + # value to be able to access this value if needed + # replace "self_p"/"self_t" from the formula by "original_self_p"/"original_self_t" + formula = replace_self_with_original_self(formula, "_p") + formula = replace_self_with_original_self(formula, "_t") + + # replace "result" from the formula by "self_p" + def repl(m: re.Match[str]) -> str: + return f"{m.group(1)}self_p{m.group(2)}" + + formula = re.sub(IDENT_REGEX.format("result"), repl, formula) + + required_primals = fw_info.required_inputs_primal + if re.search(IDENT_REGEX.format("self_p"), formula): + required_primals = ( + required_primals + ("self",) if required_primals else ("self",) + ) + + if not is_exact_match: + # NOTE [In-place forward AD formula Optimization] + # + # This optimization transforms the formula to directly do inplace, i.e. + # instead of self_t.copy_(self_t.op()) we do self_t.op_() when the following are met: + # + # 1) the formula satisfies the pattern: "self_t.op(*args)" + # 2) "op" in (1) needs to be the same as the op the derivative is for + # + # (2) may seem too strict, but currently the only ops that satisfy (1) also satisfy (2) + # If there is a need, we can relax (2) to allow any op that has an in-place variant + is_single_method_on_self_t = False + directly_do_inplace = False + op_name: str | None = None + between_parens: str | None = None + match = re.fullmatch(r"self_t.([\w]*)\((.*)\)", formula) + if match: + op_name, between_parens = match.group(1), match.group(2) + + # We want to... + # Match: self_t.op1(other_p.op2(arg)) + # Avoid: self_t.op1(args) + self_t.op2(args) + # Avoid: self_t.op1(other_p.op2(arg)) + self_t.op2(args) + def check_parens_nest_level_gt_zero(s: str) -> bool: + level = 1 + for ch in s: + if ch == ")": + level -= 1 + if level == 0: + return False + if ch == "(": + level += 1 + return True + + is_single_method_on_self_t = check_parens_nest_level_gt_zero( + between_parens + ) + directly_do_inplace = ( + is_single_method_on_self_t and op_name == info.name + ) + + if directly_do_inplace: + assert op_name is not None + assert between_parens is not None + formula = f"self_t_raw.defined() ? self_t_raw.{op_name}_({between_parens}) : {formula}" + else: + # Make sure that the forward grad is modified inplace when the original formula + # is out of place + formula = f"self_t_raw.defined() ? 
self_t_raw.copy_({formula}) : {formula}" + + required_original_self_value = bool( + re.search(IDENT_REGEX.format("original_self_p"), formula) + ) or bool(re.search(IDENT_REGEX.format("original_self_t"), formula)) + + forward_derivatives = [ + ForwardDerivative( + formula=formula, + var_names=("self",), + var_types=fw_info.var_types, + required_inputs_fw_grad=fw_info.required_inputs_fw_grad, + required_inputs_primal=required_primals, + required_original_self_value=required_original_self_value, + is_reusing_outplace_formula=not is_exact_match, + ), + ] + + fw_derivative_dict[key] = forward_derivatives + + result.append( + NativeFunctionWithDifferentiabilityInfo( + func=f, info=info_dict, fw_derivatives=fw_derivative_dict + ) + ) + + return result + + +def is_differentiable( + name: str, type: Type, info: DifferentiabilityInfo | None +) -> bool: + return type.is_tensor_like() and ( + info is None or name not in info.non_differentiable_arg_names + ) + + +def gen_differentiable_outputs( + fn: NativeFunctionWithDifferentiabilityInfo, key: str = "Default" +) -> list[DifferentiableOutput]: + f = fn.func + info = fn.info[key] if fn.info else None + outputs: list[DifferentiableOutput] = [ + DifferentiableOutput( + name=name, + type=ret.type, + cpp_type=cpp.return_type(ret, symint=True).cpp_type(), + ) + for name, ret in zip(cpp.return_names(f), f.func.returns) + ] + output_differentiability = info.output_differentiability if info else None + if output_differentiability is not None: + if len(output_differentiability) != len(outputs): + raise RuntimeError( + f"The length of output_differentiability ({len(output_differentiability)}), " + f"does not match the number of outputs ({len(outputs)})." + ) + differentiable_outputs: list[DifferentiableOutput] = [] + if False in output_differentiability and f.func.kind() == SchemaKind.inplace: + raise RuntimeError( + "output_differentiability=False for inplace operation (version_counter won't get updated)" + ) + for differentiable, output in zip(output_differentiability, outputs): + if differentiable: + differentiable_outputs.append(output) + return differentiable_outputs + candidate_differentiable_outputs = list( + filter(lambda r: is_differentiable(r.name, r.type, info), outputs) + ) + if uses_single_grad(info): + return candidate_differentiable_outputs[:1] + else: + return candidate_differentiable_outputs diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py new file mode 100644 index 00000000000..c657570ee3e --- /dev/null +++ b/torchgen/api/cpp.py @@ -0,0 +1,472 @@ +from __future__ import annotations + +from typing import Sequence + +from torchgen import local +from torchgen.api.types import ( + ArgName, + ArrayCType, + ArrayRefCType, + BaseCType, + BaseTypeToCppMapping, + Binding, + boolT, + ConstRefCType, + CType, + dimnameListT, + intArrayRefT, + iTensorListRefT, + ListCType, + longT, + MutRefCType, + NamedCType, + OptionalCType, + optionalIntArrayRefT, + optionalSymIntArrayRefT, + scalarT, + SpecialArgName, + symIntArrayRefT, + SymIntT, + tensorListT, + tensorOptionsT, + tensorT, + TupleCType, + VectorCType, + voidT, +) +from torchgen.model import ( + Argument, + Arguments, + BaseTy, + BaseType, + FunctionSchema, + ListType, + NativeFunction, + OptionalType, + Return, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.utils import assert_never + + +# This file describes the translation of JIT schema to the public C++ +# API, which is what people use when they call functions like at::add. 
+# +# Prominent characteristics of the C++ API: +# +# - dtype, layout, device and pin_memory are collected into +# a single C++ type TensorOptions (the native functions API +# also has this, but tensor options is really most relevant +# for the C++ API; it makes calling kwarg factory functions +# pleasant) +# +# - defaulting lives here (in fact, the dispatcher is completely +# oblivious of defaults!) +# +# BTW: policy on name collisions: we try not to have types with +# collisions, but functions are fair game to collide + + +def name( + func: FunctionSchema, + *, + faithful_name_for_out_overloads: bool = False, + symint_overload: bool = False, +) -> str: + name = str(func.name.name) + if symint_overload: + name += "_symint" + if func.is_out_fn(): + if faithful_name_for_out_overloads: + name += "_outf" + else: + name += "_out" + + return name + + +# Translation of "value types" in JIT schema to C++ API type. Value +# types look the same no matter if they are argument types or return +# types. Returns None if the type in question is not a value type. +def valuetype_type( + t: Type, + *, + binds: ArgName, + mutable: bool = True, + remove_non_owning_ref_types: bool = False, + symint: bool = False, +) -> NamedCType | None: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar: + return None + elif str(t) == "SymInt": + if symint: + return NamedCType(binds, BaseCType(SymIntT)) + else: + return NamedCType(binds, BaseCType(longT)) + if remove_non_owning_ref_types: + if t.name == BaseTy.str: + raise AssertionError( + "string ref->value conversion: not implemented yet" + ) + # All other BaseType currently map directly to BaseCppTypes. + return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name])) + elif isinstance(t, OptionalType): + elem = valuetype_type(t.elem, binds=binds, mutable=mutable, symint=symint) + if elem is None: + return None + return NamedCType(binds, OptionalCType(elem.type)) + elif isinstance(t, ListType): + if str(t.elem) == "bool": + assert t.size is not None + return NamedCType(binds, ArrayCType(BaseCType(boolT), t.size)) + else: + return None + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# Translation of types occurring in JIT arguments to a C++ argument type. +# If remove_non_owning_ref_types is set, we'll guarantee that the outputed CType is not a non-owning reference type. +# For example, we'll return std::vector instead of IntArrayRef. 
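+# A few illustrative mappings produced by the translation below (assuming non-mutable
+# arguments and default flags; exact spellings come from the CType definitions in
+# torchgen/api/types):
+#   Tensor  -> const at::Tensor &
+#   Tensor? -> const ::std::optional<at::Tensor> &
+#   Scalar  -> const at::Scalar &
+#   int[]   -> at::IntArrayRef (or ::std::vector<int64_t> if remove_non_owning_ref_types is set)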
+# See Note [translation from C++ reference to value types] +def argumenttype_type( + t: Type, + *, + mutable: bool, + binds: ArgName, + remove_non_owning_ref_types: bool = False, + symint: bool = False, +) -> NamedCType: + # If it's a value type, do the value type translation + r = valuetype_type( + t, + binds=binds, + mutable=mutable, + symint=symint, + remove_non_owning_ref_types=remove_non_owning_ref_types, + ) + if r is not None: + return r + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable and not local.use_const_ref_for_mutable_tensors(): + return NamedCType(binds, MutRefCType(BaseCType(tensorT))) + else: + return NamedCType(binds, ConstRefCType(BaseCType(tensorT))) + elif t.name == BaseTy.Scalar: + return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) + else: + raise AssertionError(f"base type should have been value type {t}") + elif isinstance(t, OptionalType): + if str(t.elem) == "Tensor": + if mutable and not local.use_const_ref_for_mutable_tensors(): + return NamedCType( + binds, MutRefCType(BaseCType(tensorT)) + ) # TODO: fix this discrepancy + else: + return NamedCType( + binds, ConstRefCType(OptionalCType(BaseCType(tensorT))) + ) + elif str(t.elem) == "Scalar": + return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) + elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int": + return NamedCType(binds, BaseCType(optionalIntArrayRefT)) + elif isinstance(t.elem, ListType) and str(t.elem.elem) == "SymInt": + if symint: + return NamedCType(binds, BaseCType(optionalSymIntArrayRefT)) + else: + return NamedCType(binds, BaseCType(optionalIntArrayRefT)) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint) + return NamedCType(binds, OptionalCType(elem.type)) + elif isinstance(t, ListType): + # TODO: remove these special cases, ArrayRef fallthrough works fine + if str(t.elem) == "int": + if remove_non_owning_ref_types: + return NamedCType(binds, VectorCType(BaseCType(longT))) + else: + return NamedCType(binds, BaseCType(intArrayRefT)) + if str(t.elem) == "SymInt": + if remove_non_owning_ref_types: + if symint: + return NamedCType(binds, VectorCType(BaseCType(SymIntT))) + else: + return NamedCType(binds, VectorCType(BaseCType(longT))) + else: + if symint: + return NamedCType(binds, BaseCType(symIntArrayRefT)) + else: + return NamedCType(binds, BaseCType(intArrayRefT)) + if str(t.elem) == "Tensor": + if local.use_ilistref_for_tensor_lists(): + return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT))) + else: + return NamedCType(binds, BaseCType(tensorListT)) + elif str(t.elem) == "Scalar": + return NamedCType(binds, ArrayRefCType(BaseCType(scalarT))) + elif str(t.elem) == "Dimname": + return NamedCType(binds, BaseCType(dimnameListT)) + elif str(t.elem) == "Tensor?": + return NamedCType( + binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT)))) + ) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint) + return NamedCType(binds, ArrayRefCType(elem.type)) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# Translate a JIT argument into its C++ type +def argument_type(a: Argument, *, binds: ArgName, symint: bool = False) -> NamedCType: + return argumenttype_type(a.type, mutable=a.is_write, symint=symint, binds=binds) + + +# Translation of a (non-multi) return type from JIT to C++ +# N.B: returntype_type returns a CType, not a NamedCType. +# This is mostly because of the mismatch between return types and return names. +# e.g. 
a function with a return type of 'void' has 0 return names, +# and a function with a return type of 'std::tuple' has >1 return name. +def returntype_type(t: Type, *, mutable: bool, symint: bool = False) -> CType: + # placeholder is ignored + # NB: symint is ALWAYS respected for return types. So symint argument + # here is IGNORED + r = valuetype_type(t, binds="__placeholder__", mutable=mutable, symint=True) + if r is not None: + return r.type + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + if local.use_const_ref_for_mutable_tensors(): + return ConstRefCType(BaseCType(tensorT)) + else: + return MutRefCType(BaseCType(tensorT)) + else: + # Note [Tensor Copy Returns] + # Currently, we use "Argument.is_write" to determine + # whether or not Tensor return types should be copies or references. + # If that ever changes, take a look at other locations of this note! + return BaseCType(tensorT) + elif t.name == BaseTy.Scalar: + return BaseCType(scalarT) + elif isinstance(t, ListType): + assert ( + not mutable + ), "Native functions should never return a mutable tensor list. They should return void." + elem = returntype_type(t.elem, mutable=False) + assert t.size is None, f"fixed size list returns not supported: {t}" + return VectorCType(elem) + elif isinstance(t, OptionalType): + elem = returntype_type(t.elem, mutable=mutable) + if str(t.elem) == "Tensor": + return OptionalCType(elem) + + raise AssertionError(f"unrecognized return type {t}") + + +# Translation of a single return to its C++ type +def return_type(r: Return, *, symint: bool = False) -> CType: + return returntype_type(r.type, mutable=r.is_write, symint=symint) + + +# Translation of a full (possibly multi) return from JIT to its C++ type +def returns_type(rs: Sequence[Return], *, symint: bool = False) -> CType: + if len(rs) == 0: + return BaseCType(voidT) + elif len(rs) == 1: + return return_type(rs[0], symint=symint) + else: + return TupleCType([return_type(r, symint=symint) for r in rs]) + + +def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]: + returns: list[str] = [] + for i, r in enumerate(f.func.returns): + # If we have an inplace function, the return argument is + # implicitly named self. + # TODO: Consider incorporating this into the data model + if f.func.name.name.inplace: + assert i == 0, "illegal inplace function with multiple returns" + name = "self" + # If we are out function, the name is the name of the + # corresponding output function (r.name will get recorded + # in field_name later.) + elif f.func.is_out_fn(): + name = f.func.arguments.out[i].name + # If the return argument is explicitly named... 
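+            # (e.g. an op declared to return "(Tensor values, Tensor indices)" yields the
+            # names "values" and "indices"; a "_return" suffix is appended below if the name
+            # collides with an argument name and this is not an out function)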
+ elif r.name: + name_conflict = any( + r.name == a.name for a in f.func.schema_order_arguments() + ) + if name_conflict and not f.func.is_out_fn(): + name = f"{r.name}_return" + else: + name = r.name + # If there is no explicit name and no fallback name was passed in, we just name the output result, + # unless it's a multi-return, in which case it's result0, + # result1, etc (zero-indexed) + else: + name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}" + returns.append(name) + return returns + + +JIT_TO_CPP_DEFAULT = { + "False": "false", + "True": "true", + "None": "::std::nullopt", # UGH this one is type directed + "Mean": "at::Reduction::Mean", + "[]": "{}", + "contiguous_format": "c10::MemoryFormat::Contiguous", + "long": "at::kLong", +} + + +# Convert a JIT default into C++ expression representing the default +def default_expr(d: str, t: Type, *, symint: bool) -> str: + if d == "None" and str(t) == "Tensor?": + return "{}" + if isinstance(t, BaseType) and t.name is BaseTy.str: + # Schema allows single quotes but C++ needs double + if len(d) >= 2 and d[0] == "'" and d[-1] == "'": + s = "" + i = 1 + while i + 1 < len(d): + if d[i] != "\\": + if d[i] == '"': + s += '\\"' + else: + s += d[i] + i += 1 + else: + if d[i + 1] == "'": + s += "'" + else: + s += d[i : i + 2] + i += 2 + + return f'"{s}"' + + if isinstance(t, OptionalType): + if d == "None": + return "::std::nullopt" + + return default_expr(d, t.elem, symint=symint) + + if isinstance(t, ListType): + if d.startswith("[") and d.endswith("]"): + return "{" + d[1:-1] + "}" + elif symint and d.isdigit() and str(t.elem) == "SymInt": + return f"c10::SymInt({d})" + elif t.size is None: + # NOTE: Sized lists can have scalar defaults + raise ValueError(f"Expected a list default '[...]' but found: '{d}'") + + return JIT_TO_CPP_DEFAULT.get(d, d) + + +# Convert an argument into its C++ API form + + +def argument( + a: Argument | TensorOptionsArguments | SelfArgument, + *, + cpp_no_default_args: set[str], + method: bool, + faithful: bool, + symint: bool = False, + has_tensor_options: bool, +) -> list[Binding]: + def sub_argument( + a: Argument | TensorOptionsArguments | SelfArgument, + ) -> list[Binding]: + return argument( + a, + cpp_no_default_args=cpp_no_default_args, + method=method, + faithful=faithful, + symint=symint, + has_tensor_options=has_tensor_options, + ) + + if isinstance(a, Argument): + binds: ArgName + if a.name == "memory_format" and has_tensor_options: + binds = SpecialArgName.possibly_redundant_memory_format + else: + binds = a.name + default: str | None = None + if a.name not in cpp_no_default_args and a.default is not None: + default = default_expr(a.default, a.type, symint=symint) + return [ + Binding( + nctype=argument_type(a, binds=binds, symint=symint), + name=a.name, + default=default, + argument=a, + ) + ] + elif isinstance(a, TensorOptionsArguments): + if faithful: + return ( + sub_argument(a.dtype) + + sub_argument(a.layout) + + sub_argument(a.device) + + sub_argument(a.pin_memory) + ) + else: + default = None + # Enforced by NativeFunction.__post_init__ + assert "options" not in cpp_no_default_args + if all(x.default == "None" for x in a.all()): + default = "{}" + elif a.dtype.default == "long": + default = "at::kLong" # TODO: this is wrong + return [ + Binding( + nctype=NamedCType("options", BaseCType(tensorOptionsT)), + name="options", + default=default, + argument=a, + ) + ] + elif isinstance(a, SelfArgument): + if method: + # Caller is responsible for installing implicit this in context! 
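+ # (Note: no Binding is emitted for `self` here; in the method variant of the
+ # C++ API the receiver is the implicit object argument, e.g. `self.abs()`
+ # rather than `at::abs(self)`.)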
+ return [] + else: + return sub_argument(a.argument) + else: + assert_never(a) + + +def arguments( + arguments: Arguments, + *, + faithful: bool, + symint: bool = False, + method: bool, + cpp_no_default_args: set[str], +) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + if faithful: + args.extend(arguments.non_out) + args.extend(arguments.out) + else: + args.extend(arguments.out) + args.extend(arguments.non_out) + return [ + r.no_default() if faithful else r + for a in args + for r in argument( + a, + faithful=faithful, + symint=symint, + method=method, + has_tensor_options=arguments.tensor_options is not None, + cpp_no_default_args=cpp_no_default_args, + ) + ] diff --git a/torchgen/api/dispatcher.py b/torchgen/api/dispatcher.py new file mode 100644 index 00000000000..103e6cf4299 --- /dev/null +++ b/torchgen/api/dispatcher.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import itertools +from typing import Sequence + +from torchgen.api import cpp +from torchgen.api.types import ArgName, Binding, CType, NamedCType +from torchgen.model import ( + Argument, + FunctionSchema, + Return, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.utils import assert_never, concatMap + + +# This file describes the translation of JIT schema to the dispatcher +# API, the *unboxed* calling convention by which invocations through +# the dispatcher are made. Historically, the dispatcher API matched +# the C++ API, but with the establishment of the boxed API, we've +# made changes to the dispatcher API to so that the unboxed API +# better aligns with the boxed API. The dispatcher API hooks heavily +# into our template based boxing/unboxing machinery, so changes +# to this convention will usually need template updates too. +# +# Prominent characteristics of the dispatcher API: +# +# - dtype, layout, device and pin_memory are represented as separate +# arguments. +# + + +def name(func: FunctionSchema) -> str: + return cpp.name(func) + + +def argumenttype_type( + t: Type, + *, + mutable: bool, + binds: ArgName, + remove_non_owning_ref_types: bool = False, + symint: bool = True, +) -> NamedCType: + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. + return cpp.argumenttype_type( + t, + mutable=mutable, + binds=binds, + symint=symint, + remove_non_owning_ref_types=remove_non_owning_ref_types, + ) + + +def argument_type( + a: Argument, + *, + binds: ArgName, + remove_non_owning_ref_types: bool = False, + symint: bool = True, +) -> NamedCType: + return argumenttype_type( + a.type, + mutable=a.is_write, + binds=binds, + remove_non_owning_ref_types=remove_non_owning_ref_types, + symint=symint, + ) + + +def returns_type(rs: Sequence[Return], *, symint: bool = True) -> CType: + # At present, there is no difference. But there could be! 
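+ # (For example, a schema returning "(Tensor, Tensor)" maps to a std::tuple of
+ # two at::Tensor values under both the C++ and the dispatcher conventions.)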
+ return cpp.returns_type(rs, symint=symint) + + +def jit_arguments(func: FunctionSchema) -> list[Argument]: + def to_argument( + a: Argument | TensorOptionsArguments | SelfArgument, + ) -> list[Argument]: + if isinstance(a, Argument): + return [a] + elif isinstance(a, SelfArgument): + return [a.argument] + elif isinstance(a, TensorOptionsArguments): + return [a.dtype, a.layout, a.device, a.pin_memory] + else: + assert_never(a) + + return list( + concatMap( + to_argument, + itertools.chain( + func.arguments.positional, func.arguments.kwarg_only, func.arguments.out + ), + ) + ) + + +def argument( + a: Argument, *, remove_non_owning_ref_types: bool = False, symint: bool = True +) -> Binding: + return Binding( + nctype=argument_type( + a, + binds=a.name, + remove_non_owning_ref_types=remove_non_owning_ref_types, + symint=symint, + ), + name=a.name, + argument=a, + ) + + +def arguments(func: FunctionSchema, *, symint: bool = True) -> list[Binding]: + return [argument(a, symint=symint) for a in jit_arguments(func)] diff --git a/torchgen/api/functionalization.py b/torchgen/api/functionalization.py new file mode 100644 index 00000000000..93667e39b17 --- /dev/null +++ b/torchgen/api/functionalization.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +from torchgen.api import dispatcher +from torchgen.api.types import ( + BaseCppType, + BaseCType, + Binding, + boolT, + ConstRefCType, + CType, + longT, + NamedCType, + tensorT, +) +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + FunctionSchema, + NativeFunction, + NativeFunctionsViewGroup, +) + + +# This file describes the translation of JIT schema to API's used +# when creating view lambdas that are used by the functionalization pass. +# There are two types of lambdas: forward lambdas and reverse lambdas. +# These API's mostly follow the dispatcher API, with a few quirks: +# - The lambda capture has to convert reference types to value types +# - While the forward lambda just directly calls into the at::_ops API +# (following the dispatcher convention), the logic here for the reverse lambda +# is responsible for generating both the call-site, and the declarations +# (which are implemented manually in the at::functionalization::impl namespace). + +# The lambdas generated for each view op in the functionalization pass are of the form +# [capture_arguments](outer_arguments) -> returns_type { +# return name(inner_arguments); +# } + +# Define some specific lambda input arguments. 
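+# As a rough illustration (the exact generated code differs), a forward lambda for
+# a hypothetical view op `my_view(Tensor(a) self, int dim)`, assembled from the
+# bindings defined below, would look something like:
+#
+#   [reapply_views, dim](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
+#     return at::_ops::my_view::call(base, dim);
+#   }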
+base_binding = Binding( + name="base", + nctype=NamedCType(name="base", type=ConstRefCType(BaseCType(tensorT))), + argument=Argument( + name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None + ), + default=None, +) +mutated_view_binding = Binding( + name="mutated_view", + nctype=NamedCType(name="mutated_view", type=ConstRefCType(BaseCType(tensorT))), + argument=Argument( + name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None + ), + default=None, +) +mutated_view_idx_binding = Binding( + name="mutated_view_idx", + nctype=NamedCType(name="mutated_view_idx", type=BaseCType(longT)), + argument=Argument( + name="base", type=BaseType(BaseTy.Tensor), default=None, annotation=None + ), + default=None, +) +reapply_views_binding = Binding( + name="reapply_views", + nctype=NamedCType(name="reapply_views", type=BaseCType(boolT)), + argument=Argument( + name="reapply_views", type=BaseType(BaseTy.bool), default=None, annotation=None + ), + default=None, +) + +InverseReturnModeT = BaseCppType("at::functionalization", "InverseReturnMode") +inverse_return_mode_binding = Binding( + name="inverse_return_mode", + nctype=NamedCType(name="inverse_return_mode", type=BaseCType(InverseReturnModeT)), + argument=Argument( + name="inverse_return_mode", + # NB: not actually a bool but it doesn't matter because this isn't used + type=BaseType(BaseTy.bool), + default=None, + annotation=None, + ), + default=None, +) + + +# The lambda capture itself doesn't have a name. +# The name returned here corresponds to the name of the inner function called by the lambda. +def name( + g: NativeFunctionsViewGroup, + *, + is_reverse: bool, + include_namespace: bool, + reapply_views: bool | None = None, +) -> str: + if reapply_views is None: + # reapply_views is only important for the fwd lambda, + # since we always plumb the runtime "reapply_views" argument into the reverse function. + assert is_reverse + if is_reverse: + return reverse_name(g.view, include_namespace) + # in the forward case, we just directly call into the at::_ops API (so we always need the namespace) + assert include_namespace + assert g.view_copy is not None + api_name = ( + g.view.func.name.unambiguous_name() + if reapply_views + else g.view_copy.func.name.unambiguous_name() + ) + return f"at::_ops::{api_name}::call" + + +def reverse_name(f: NativeFunction, include_namespace: bool) -> str: + # for the reverse: we plumb the "reapply_views" flag into that function and support + # both copy and non-copy variants. (We could avoid doing that, but that would require + # writing out twice as many view inverse functions). + api_name = f.func.name.unambiguous_name() + # in the reverse case, we codegen both the call-sites (which need the full namespace) and the declarations (which don't) + if include_namespace: + return f"at::functionalization::FunctionalInverses::{api_name}_inverse" + else: + return f"{api_name}_inverse" + + +def capture_arguments(func: FunctionSchema, *, is_reverse: bool) -> list[Binding]: + # capture arguments include all arguments except `self`. 
+ # Importantly, they don't include any C++ reference types (or else we'll get a dangling reference in the capture), + # So any reference types (IntArrayRef) need to be converted to value types (vector) + args = func.arguments.flat_all + assert args[0].type == BaseType(BaseTy.Tensor) + non_self_args = args[1:] + non_self_value_bindings = [ + dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args + ] + + all_bindings = [ + inverse_return_mode_binding if is_reverse else reapply_views_binding + ] + all_bindings.extend(non_self_value_bindings) + return all_bindings + + +def returns_type(func: FunctionSchema) -> CType: + # Assertion: all view ops return tensor-like outputs + assert len(func.returns) >= 1 + for ret in func.returns: + assert ret.type.is_tensor_like() + # However, the return type of the lambda is always an individual tensor. + # For multi-tensor outputs, each tensor needs to be tracked individually. + return BaseCType(tensorT) + + +def outer_arguments(*, is_reverse: bool) -> list[Binding]: + if is_reverse: + return [base_binding, mutated_view_binding, mutated_view_idx_binding] + else: + return [base_binding, mutated_view_idx_binding] + + +def inner_call_index(func: FunctionSchema) -> Binding | None: + # For view ops that return multiple tensors (like `split`), we generate a separate lambda for each output. + # When we replay a view op that returns multiple tensors, we need to index into the output appropriately + if len(func.returns) > 1 or ( + len(func.returns) == 1 and func.returns[0].type.is_list_like() + ): + return mutated_view_idx_binding + return None + + +def inner_arguments(func: FunctionSchema, is_reverse: bool) -> list[Binding]: + args = func.arguments.flat_all + assert args[0].type == BaseType(BaseTy.Tensor) + non_self_args = args[1:] + # The forward lambda calls the at::_ops API, while the reverse lambda calls the view inverse API. + # Both of these follow the dispatcher API. + non_self_bindings = [dispatcher.argument(a) for a in non_self_args] + if not is_reverse: + # the forward lambda swaps out the original tensor argument with the lambd arg "base" + return [base_binding] + non_self_bindings + else: + # the reverse lambda does the same, but with an additional "mutated_view" arg + # additionally, we have a calling convention: for view ops that return multiple tensor outputs + # their corresponding view_inverse function takes in an additional index argument. + index_binding = inner_call_index(func) + if index_binding is not None: + return [ + base_binding, + mutated_view_binding, + inverse_return_mode_binding, + index_binding, + ] + non_self_bindings + else: + return [ + base_binding, + mutated_view_binding, + inverse_return_mode_binding, + ] + non_self_bindings diff --git a/torchgen/api/lazy.py b/torchgen/api/lazy.py new file mode 100644 index 00000000000..cfffa516b65 --- /dev/null +++ b/torchgen/api/lazy.py @@ -0,0 +1,467 @@ +from __future__ import annotations + +from typing import Any + +from torchgen.api.types import ( + BaseCppType, + BaseCType, + boolT, + CType, + deviceT, + doubleT, + generatorT, + layoutT, + ListCType, + longT, + memoryFormatT, + NamedCType, + OptionalCType, + scalarT, + scalarTypeT, + stringT, + SymIntT, + VectorCType, +) +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + FunctionSchema, + ListType, + OperatorName, + OptionalType, + Return, + TensorOptionsArguments, + Type, +) + + +_valueT: BaseCppType | None = None + + +# A ValueT is an IR type which represents the computation of a Tensor. 
In other +# words, a PyTorch user will do operations on lazy tensors, and each output lazy +# tensor internally tracks a ValueT representing the IR node that would have +# actually produced the value of this tensor for real. +# +# This is configurable because different lazy tensor backends (LTC vs XLA) will +# have different IR representations. (Though, arguably, after unification they +# shouldn't!) +def getValueT() -> BaseCppType: + global _valueT + if not _valueT: + raise NotImplementedError( + "The value type needs to be set with setValueT() in run_gen_lazy_tensor()" + ) + + return _valueT + + +def setValueT(val: BaseCppType) -> None: + global _valueT + _valueT = val + + +# this is a bad hack. I need to refactor the data model to represent each arg in the schema as an object, +# making it easier to represent special properties of an arg. +tensorListValueT = BaseCppType("torch::lazy", "Value") + + +def process_ir_type( + typ: Type, properties: LazyIrProperties, *, symint: bool +) -> BaseCType | VectorCType | OptionalCType | ListCType: + """ + This function takes a type from NativeFunctions and converts it for use with + lazy tensor codegen. + + Type conversion for lazy currently consists of + (1) changing at::Tensors into lazy::Values + (2) wrapping everything in a BaseCType + (3) making cpp-reference types into cpp-value types (e.g. vector instead of IntArrayRef) + + (1) converts at::Tensors to lazy::Values (which wrap lazy::Nodes, with which Lazy IR represents tensors.) + There is special handling for Optional[Tensor] or List[Tensor], etc- hence 'tensor-like' + + This is incomplete- there are assertions in places that it's expected to need to add + more types as the codegen is used with more operators. + """ + if isinstance(typ, BaseType): + if typ.name == BaseTy.Tensor: + return BaseCType(getValueT()) + elif typ.name == BaseTy.Scalar: + if properties.TreatScalarsAsConstants: + return BaseCType(scalarT) + # at::scalar has special handling, + # and is wrapped in an lazy::Value just like at::tensor + return BaseCType(getValueT()) + elif typ.name == BaseTy.ScalarType: + return BaseCType(scalarTypeT) + elif typ.name == BaseTy.int: + return BaseCType(longT) + elif typ.name == BaseTy.SymInt: + if symint: + return BaseCType(getValueT()) + else: + return BaseCType(longT) + elif typ.name == BaseTy.bool: + return BaseCType(boolT) + elif typ.name == BaseTy.float: + return BaseCType(doubleT) + elif typ.name == BaseTy.str: + return BaseCType(stringT) + elif typ.name == BaseTy.Device: + return BaseCType(deviceT) + elif typ.name == BaseTy.Generator: + return BaseCType(generatorT) + elif typ.name == BaseTy.Layout: + return BaseCType(layoutT) + elif typ.name == BaseTy.MemoryFormat: + return BaseCType(memoryFormatT) + else: + raise AssertionError(f"TODO add support for type {repr(typ)}") + elif isinstance(typ, OptionalType): + return OptionalCType(process_ir_type(typ.elem, properties, symint=symint)) + elif isinstance(typ, ListType): + if str(typ.elem) == "Tensor?": + # TODO(whc) is this actually correct? or should it use a Vector like above + return ListCType(OptionalCType(BaseCType(getValueT()))) + elif str(typ.elem) == "Tensor": + # this is a TensorList which comes in from GetTensorList as a Value + return BaseCType(tensorListValueT) + elif typ.elem == BaseType(BaseTy.SymInt): + # TODO: return a value type. 
The problem here is analogous to + # the problem with tensorListValueT: if you have SymInt[] you + # cannot conveniently save the list of Value directly, as nodes + # expect to save values as a vector for ALL arguments. So you + # need a separate IR node that represents all of the size nodes + # assembled into a list. I'm not an LTC dev so I don't want to + # figure it out right now. Y'all figure it out... + return VectorCType(BaseCType(longT)) + + else: + return VectorCType(process_ir_type(typ.elem, properties, symint=symint)) + else: + raise AssertionError(f"unrecognized type {repr(typ)}") + + +# TODO: Determining this based off of CType is bad; this should be computed +# from Type directly; then the same logic as process_ir_type can be used +# +# Invariant: passed typ should be an *owning* CType (e.g., we will report +# that ArrayRef is NOT a value type) +def isValueType(typ: CType, properties: LazyIrProperties | None = None) -> bool: + """ + Given a type, determine if it is a Value-like type. This is equivalent to + being Tensor-like, but assumes the type has already been transformed. + """ + if isinstance(typ, BaseCType): + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + treat_scalars_as_constants = properties and properties.TreatScalarsAsConstants + return ( + typ.type == getValueT() + or (typ.type == scalarT and not treat_scalars_as_constants) + or typ.type == SymIntT + ) + elif typ == VectorCType(BaseCType(SymIntT)): + # TODO: report True for this + return False + elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): + return isValueType(typ.elem, properties) + return False + + +def isSymIntType(typ: Type) -> bool: + return isinstance(typ, BaseType) and typ.name == BaseTy.SymInt + + +def isWrappedScalarType(typ: Type) -> bool: + """ + Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. + Since we literally change the type from scalarT to valueT, information is lost. 
+ This function helps build a list of wrapped scalars to save that information + """ + if isinstance(typ, BaseType): + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.name == BaseTy.Scalar + elif isinstance(typ, (OptionalType, ListType)): + return isWrappedScalarType(typ.elem) + return False + + +# TODO: dedupe with Type.is_generator_like +def isGeneratorType(typ: Type) -> bool: + if isinstance(typ, BaseType): + return typ.name == BaseTy.Generator + elif isinstance(typ, (OptionalType)): + return isGeneratorType(typ.elem) + return False + + +# This class caches a few derived properties computed from an Argument +# and LazyIrProperties +class LazyArgument: + name: str + orig_type: Type + lazy_type_: CType | None + is_wrapped_scalar: bool + is_generator: bool + # TODO: this is lies, it is false for symint list + is_symint_or_list: bool + + # Whether or not we are treating this as symint or not + symint: bool + + # true if this argument is or contains a lazy IR value + is_lazy_value: bool + + def __init__( + self, arg: Argument, properties: LazyIrProperties, *, symint: bool + ) -> None: + self.name = arg.name + self.orig_type = arg.type + self.symint = symint + self.is_optional = isinstance(arg.type, OptionalType) + self.is_generator = isGeneratorType(arg.type) + self.lazy_type_ = process_ir_type(arg.type, properties, symint=symint) + self.is_wrapped_scalar = isWrappedScalarType(arg.type) + self.is_symint_or_list = symint and ( + isSymIntType(arg.type) + or (isinstance(arg.type, OptionalType) and isSymIntType(arg.type.elem)) + # TODO: lists of symints are not currently treated as value types + # or (isinstance(arg.type, ListType) and isSymIntType(arg.type.elem)) + ) + + self.is_lazy_value = isValueType(self.lazy_type, properties) + + @property + def lazy_type(self) -> CType: + assert ( + self.lazy_type_ is not None + ), f"Attempted to access lazy_type for invalid argument {self.name}" + return self.lazy_type_ + + +class LazyIrProperties: + """Collection of properties for an IR node + + The property groups are listed below. Each group is mutually + exclusive, meaning that only one property from each group can be True + at any one time. The properties can be accessed as if they were normal + attributes. The mutual exclusivity is automatically handled. + """ + + Properties: tuple[tuple[str, ...], ...] 
= ( + ( + "ShapePrecompute", # Assume shape has been precomputed + "ShapeCompute", # Need to compute the shape on construction + "ShapeCache", # Utilize the shape cache to defer computation + ), + ( + "Lower", # Codegen full lower function + "LowerDeclOnly", # Codegen only lower function declaration + ), + ( + "CanBeReused", # Codegen full reuse function + "CanBeReusedDeclOnly", # Codegen only reuse function declaration + ), + ( + "CreateFn", # Codegen full create function + "CreateFnDeclOnly", # Codegen only create function declaration + ), + ( + "TreatScalarsAsConstants", # Treat Scalars as constants instead of handling like values + ), + ) + + def __init__(self, *default_properties: str) -> None: + properties: dict[tuple[str, ...], str | None] = dict.fromkeys( + LazyIrProperties.Properties + ) + self.__dict__["properties"] = properties + for p in default_properties: + setattr(self, p, True) + + def __getattr__(self, key: str) -> Any: + properties = self.__dict__["properties"] + for values in LazyIrProperties.Properties: + if key in values: + return properties[values] == key + + return self.__getattribute__(key) + + def __setattr__(self, key: str, value: Any) -> Any: + properties = self.__dict__["properties"] + for values in LazyIrProperties.Properties: + if key in values: + properties[values] = key if value else None + return value + + raise KeyError(f"Invalid property: {key}") + + +# Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. +# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), +# but carries type information from a native FunctionSchema modified for use with IR nodes, +# and preserving original argument names. +# +# TODO: This is not idiomatic with how other torchgen APIs transform on schema. +class LazyIrSchema: + # The name of the operator this function schema describes. + name: OperatorName + + positional_args: tuple[LazyArgument, ...] + keyword_args: tuple[LazyArgument, ...] + + # TODO: Need to handle collisions with argument names at some point + returns: tuple[Return, ...] 
+ + # if this schema has a Generator arg, list its orig ctype/name but don't + # build a LazyArgument since lazy IR doesn't support it + generator_arg: NamedCType | None = None + + # original function schema + func: FunctionSchema + + # Whether or not we are code-genning for SymInt or not + symint: bool + + properties: LazyIrProperties = LazyIrProperties( + # default properties + "ShapePrecompute", + "Lower", + "CanBeReused", + ) + opkind: str | None = None + + def __init__( + self, + func: FunctionSchema, + properties: LazyIrProperties | None = None, + *, + symint: bool, + ) -> None: + if properties: + self.properties = properties + + self.func = func + self.symint = symint + positional_args: list[LazyArgument] = [] + for arg_field in ["pre_self_positional", "self_arg", "post_self_positional"]: + if arg_field == "self_arg" and func.arguments.self_arg is not None: + arg = func.arguments.self_arg.argument + positional_args.append( + LazyArgument(arg, self.properties, symint=symint) + ) + elif getattr(func.arguments, arg_field) is not None: + positional_args.extend( + LazyArgument(arg, self.properties, symint=symint) + for arg in getattr(func.arguments, arg_field) + ) + self.positional_args = tuple(positional_args) + + keyword_args: list[LazyArgument] = [] + for arg_field in [ + "pre_tensor_options_kwarg_only", + "tensor_options", + "post_tensor_options_kwarg_only", + "out", + ]: + curr_args = getattr(func.arguments, arg_field) + if curr_args is not None: + if isinstance(curr_args, TensorOptionsArguments): + curr_args = curr_args.all() + for arg in curr_args: + if isGeneratorType(arg.type): + assert ( + self.generator_arg is None + ), "We expect there is only one generator arg" + self.generator_arg = NamedCType( + arg.name, arg.type # type:ignore[arg-type] + ) + keyword_args.extend( + LazyArgument(arg, self.properties, symint=symint) + for arg in curr_args + ) + self.keyword_args = tuple(keyword_args) + self.name = func.name + self.returns = func.returns + + @property + def node_name(self) -> str: + """ + Return camel-case version of op in node. + + Note: This function also appends any `overload_name` in the operation. + For example, if the op is `bitwise_and.Tensor`, the returned name + will be `BitwiseAndTensor`. + """ + op_name = f"{self.name.name}_{self.name.overload_name}".lower() + return "".join(word.capitalize() or "" for word in op_name.split("_")) + + @property + def aten_name(self) -> str: + return str(self.name.name) + + @property + def base_name(self) -> str: + return f"{self.name.name.base}" + + def filtered_args( + self, + positional: bool = True, + keyword: bool = True, + values: bool = True, + scalars: bool = True, + generator: bool = True, + ) -> list[LazyArgument]: + # This function maintains the sorted order of arguments but provides different filtered views. + # Some parts of the code care about kwargs vs args (TS lowerings), + # other parts care about whether they need to wrap the arg in a lazy value or leave it alone. + # Generators are special cased, as they are needed for fallback/shape-inference but not supported + # in TS lowerings and therefore also omitted from lazy IR. 
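+ # (Illustrative example: for a schema like `add(Tensor self, Tensor other, *, Scalar alpha=1)`,
+ # `self` and `other` are value arguments, while `alpha` counts as a scalar
+ # argument when TreatScalarsAsConstants is set and is otherwise wrapped as a
+ # lazy value as well.)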
+ args: list[LazyArgument] = [] + if positional: + args.extend(self.positional_args) + if keyword: + args.extend(self.keyword_args) + + if values and scalars and generator: + return args + elif values and scalars: + return [a for a in args if not a.is_generator] + elif values: + return [a for a in args if a.is_lazy_value] + elif scalars: + return [ + a + for a in args + if not a.is_lazy_value and (generator or not a.is_generator) + ] + + return [] + + @property + def positional_values(self) -> list[LazyArgument]: + return self.filtered_args( + positional=True, keyword=False, values=True, scalars=False + ) + + @property + def positional_scalars(self) -> list[LazyArgument]: + return self.filtered_args( + positional=True, keyword=False, values=False, scalars=True + ) + + @property + def keyword_values(self) -> list[LazyArgument]: + return self.filtered_args( + positional=False, keyword=True, values=True, scalars=False + ) + + @property + def keyword_scalars(self) -> list[LazyArgument]: + return self.filtered_args( + positional=False, keyword=True, values=False, scalars=True + ) diff --git a/torchgen/api/meta.py b/torchgen/api/meta.py new file mode 100644 index 00000000000..2e99d151fae --- /dev/null +++ b/torchgen/api/meta.py @@ -0,0 +1,13 @@ +from torchgen.model import NativeFunctionsGroup + + +# Follows dispatcher calling convention, but: +# - Mutable arguments not allowed. Meta functions are always +# written in functional form. Look at FunctionSchema.signature() +# - No tensor returns; instead we return a TensorMeta describing +# the tensor in question + + +def name(g: NativeFunctionsGroup) -> str: + # use the overload name from the functional version + return str(g.functional.func.name).replace(".", "_") diff --git a/torchgen/api/native.py b/torchgen/api/native.py new file mode 100644 index 00000000000..a00e8266b8d --- /dev/null +++ b/torchgen/api/native.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from typing import Sequence + +from torchgen import local +from torchgen.api import cpp +from torchgen.api.types import ( + ArgName, + BaseCType, + Binding, + boolT, + ConstRefCType, + CType, + deviceT, + layoutT, + ListCType, + MutRefCType, + NamedCType, + OptionalCType, + scalarT, + scalarTypeT, + tensorT, +) +from torchgen.model import ( + Argument, + FunctionSchema, + Return, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.utils import assert_never + + +# This file describes the translation of JIT schema to the native functions API. +# This looks a lot like the C++ API (which makes historical sense, because the +# idea was you wrote native functions to implement functions in the C++ API), +# but over time we have evolved the C++ API without actually changing our +# native:: kernels. The intention is to make native API and dispatcher API +# line up as closely as possible, since this results in the least overhead +# (no translation is needed from dispatcher API to native API). +# +# NB: this is symint aware, you will get the non-SymInt variant for some +# dispatch entries and SymInt for others. + + +def name(func: FunctionSchema) -> str: + name = str(func.name.name) + # TODO: delete this! 
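+ # (For example, a schema named "foo.out" that takes an out argument is mapped
+ # to "foo_out_out" by the rules below.)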
+ if func.is_out_fn(): + name += "_out" + if func.name.overload_name: + name += f"_{func.name.overload_name}" + return name + + +def argumenttype_type( + t: Type, *, mutable: bool, binds: ArgName, symint: bool +) -> NamedCType: + if str(t) == "Tensor?": + tensor_type: OptionalCType = OptionalCType(BaseCType(tensorT)) + if mutable and not local.use_const_ref_for_mutable_tensors(): + return NamedCType(binds, MutRefCType(tensor_type)) + else: + return NamedCType(binds, ConstRefCType(tensor_type)) + elif str(t) == "Tensor?[]": + return NamedCType( + binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT)))) + ) + elif str(t) == "Scalar": + return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) + elif str(t) == "Scalar?": + return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) + return cpp.argumenttype_type(t, mutable=mutable, binds=binds, symint=symint) + + +def returns_type(rs: Sequence[Return], *, symint: bool) -> CType: + return cpp.returns_type(rs, symint=symint) + + +def argument_type(a: Argument, *, binds: ArgName, symint: bool) -> NamedCType: + return argumenttype_type(a.type, mutable=a.is_write, binds=binds, symint=symint) + + +def argument( + a: Argument | SelfArgument | TensorOptionsArguments, + *, + is_out: bool, + symint: bool, +) -> list[Binding]: + # Ideally, we NEVER default native functions. However, there are a number + # of functions that call native:: directly and rely on the defaulting + # existing. So for BC, we generate defaults for non-out variants (but not + # for out variants, where it is impossible to generate an appropriate + # default) + should_default = not is_out + if isinstance(a, Argument): + default: str | None = None + if should_default and a.default is not None: + default = cpp.default_expr(a.default, a.type, symint=symint) + return [ + Binding( + nctype=argument_type(a, binds=a.name, symint=symint), + name=a.name, + default=default, + argument=a, + ) + ] + elif isinstance(a, SelfArgument): + # Erase SelfArgument from the distinction + return argument(a.argument, is_out=is_out, symint=symint) + elif isinstance(a, TensorOptionsArguments): + default = None + if should_default: + default = "{}" + # TODO: Not sure why the arguments assigned here are for + # TensorOptionsArguments and not the constituent pieces. 
It seems + # to matter + return [ + Binding( + nctype=NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))), + name="dtype", + default=default, + argument=a, + ), + Binding( + nctype=NamedCType("layout", OptionalCType(BaseCType(layoutT))), + name="layout", + default=default, + argument=a, + ), + Binding( + nctype=NamedCType("device", OptionalCType(BaseCType(deviceT))), + name="device", + default=default, + argument=a, + ), + Binding( + nctype=NamedCType("pin_memory", OptionalCType(BaseCType(boolT))), + name="pin_memory", + default=default, + argument=a, + ), + ] + else: + assert_never(a) + + +def arguments(func: FunctionSchema, *, symint: bool) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + args.extend(func.arguments.non_out) + args.extend(func.arguments.out) + return [ + r for arg in args for r in argument(arg, symint=symint, is_out=func.is_out_fn()) + ] diff --git a/torchgen/api/python.py b/torchgen/api/python.py new file mode 100644 index 00000000000..eb0f0748988 --- /dev/null +++ b/torchgen/api/python.py @@ -0,0 +1,1519 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Sequence + +from torchgen.api import cpp +from torchgen.api.types import Binding, CppSignature, CppSignatureGroup +from torchgen.gen import pythonify_default +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + FunctionSchema, + ListType, + NativeFunction, + OptionalType, + Return, + Type, + Variant, +) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Data Models +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# [Notes] python binding codegen +# +# The Python binding codegen produces code that takes the input list of +# PyObjects, finds the matching ATen C++ function using PythonArgParser, +# converts the PyObjects into C++ types and calls the ATen C++ function: +# +# +--------+ parsing +------------------------+ binding +-----------------------+ +# | PyObjs | ---------> | PythonArgParser Output | ---------> | Cpp Function Dispatch | +# +--------+ +------------------------+ +-----------------------+ +# +# The following examples demonstrate the data models the Python binding +# codegen needs to deal with and the tasks it needs to accomplish. It +# helps understand the purpose of the new data types we introduced below. +# +# - Function Schema (source of truth) +# +# aten::empty.names(int[] size, *, Dimname[]? names, +# ScalarType? dtype=None, Layout? layout=None, +# Device? device=None, bool? pin_memory=None, +# MemoryFormat? memory_format=None) -> Tensor +# +# - Python Signature +# +# It's used to generate input schema string for PythonArgParser. +# Note: TensorOptions fields are reordered and the additional +# 'requires_grad' field is added: +# +# empty(IntArrayRef size, *, DimnameList? names, +# MemoryFormat? memory_format=None, ScalarType dtype=None, +# Layout layout=torch.strided, Device device=None, +# bool pin_memory=False, bool requires_grad=False) +# +# - C++ Signature +# +# It's used to generate C++ lambda formals & dispatch call. +# Note: the scattered TensorOptions fields are packed into 'options'. 
+# +# auto dispatch_empty = +# [](IntArrayRef size, std::optional names, +# const TensorOptions & options, +# std::optional memory_format) -> Tensor { +# pybind11::gil_scoped_release no_gil; +# return torch::empty(size, names, options, memory_format); +# }; +# +# - Binding between Python Arguments and C++ Arguments +# +# Given a set of Python Arguments in scope, we need produce the +# binding expressions that translate the Python API into C++ API: +# +# Python Args Cpp Args Binding Exprs +# ----------------------------------------------------------------- +# 0: size size '_r.intlist(0)' +# 1: names names 'names' [special init] +# 2: memory_format -------+ +# 3: dtype -----+-|--> options 'options' [special packing] +# 4: layout / | +# 5: device / +--> memory_format '_r.memoryformatOptional(2)' +# 6: pin_memory / +# 7: requires_grad -+ +# +# So the full dispatch expression would look like: +# +# dispatch_empty(_r.intlist(0), names, options, +# _r.memoryformatOptional(2)) +# +# Where does 'names' come from? It involves special local init: +# +# auto __names = _r.toDimnameListOptional(1); +# std::optional names = +# __names ? std::make_optional(DimnameList(__names.value())) +# : std::nullopt; +# +# Where does 'options' come from? It involves special local init +# for TensorOptions. Note that Python side has the additional +# 'requires_grad' field: +# +# const auto options = TensorOptions() +# .dtype(_r.scalartype(3)) +# .device(_r.device(5)) +# .layout(_r.layoutOptional(4)) +# .requires_grad(_r.toBool(7)) +# .pinned_memory(_r.toBool(6)); +# +# In some other cases one Python Argument can map to multiple C++ +# Arguments. For example: +# +# aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) +# -> (Tensor values, Tensor indices) +# +# Python Args Cpp Args Binding Exprs +# --------------------------------------------------------------------- +# +----> max 'out[0]' +# /-----> max_values 'out[1] +# 0: input / self '_r.tensor(0)' +# 1: dim / dim '_r.dimname(1)' +# 2: keepdim / keepdim '_r.toBool(2)' +# 3: out -----+ [local init] out '_r.tensorlist_n<2>(3)' +# +# As demonstrated above, the binding can involve reordering, +# packing, unpacking and special local inits. +# +# +# Let's look at a concrete example: +# +# static PythonArgParser parser({ +# "abs(Tensor input, *, Tensor out=None)", +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ^ +# +--- Python Schema, represented by PythonSignature and PythonArgument +# +# }, /*traceable=*/true); +# +# ParsedArgs<2> parsed_args; +# auto _r = parser.parse(nullptr, args, kwargs, parsed_args); +# +# ... 
+# +# if (_r.isNone(1)) { +# ~~~~~~~~~~~~ <--- Scattered PythonArgParser output (arg name = 'out') +# represented by PythonArgParserOutputExpr +# +# // aten::abs(Tensor self) -> Tensor +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ^ +# +--- NativeFunction schema, base version +# +# auto dispatch_abs = [](const Tensor & self) -> Tensor { +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ^ +# +--- dispatch_lambda_args / dispatch_lambda_return_str +# generated from NativeFunction / CppSignature +# (deprecated PythonSignature is special) +# arguments are represented by DispatchLambdaArgument +# +# pybind11::gil_scoped_release no_gil; +# return self.abs(); +# ~~~~~~~~~~~ <--- cpp_dispatch_target / cpp_dispatch_exprs +# generated from NativeFunction / CppSignature +# }; +# return wrap(dispatch_abs(_r.tensor(0))); +# ~~~~~~~~~~~~~ +# ^ +# +--- dispatch_lambda_exprs +# binding PythonArgParserOutputExpr (python args) +# and DispatchLambdaArgument (c++ args) +# +# } else { +# // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ^ +# +--- NativeFunction schema, out-variant +# +# auto dispatch_abs_out = [](Tensor out, const Tensor & self) -> Tensor { +# pybind11::gil_scoped_release no_gil; +# return at::abs_out(out, self); +# }; +# return wrap(dispatch_abs_out(_r.tensor(1), _r.tensor(0))); +# } +# +# +# [Notes] python interface codegen +# The python dataclasses below are used used to generate both python binding code +# and pyi type hint signatures. +# In theory these two should look very similar, but there are number of differences +# in how pyi signatures vs. python_arg_parser signatures are generated. +# These differences have been encapsulated in signature_str() vs. signature_str_pyi() +# to display the full signatures, and argument_str() vs argument_str_pyi() to display arguments. +# For examples, only pyi signatures include return types. + + +@dataclass(frozen=True) +class PythonReturns: + returns: tuple[Return, ...] + + +@dataclass(frozen=True) +class PythonArgument: + name: str + type: Type + default: str | None + + # Used to generate the default init expr for some PythonArgParser outputs, e.g.: + # + # _r.layoutWithDefault(3, layout_from_backend(self.options().backend()))) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # ^ + # +--- default_init str + default_init: str | None + + # Compute argument formal for python argument parsing. + # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. + def argument_str(self, *, method: bool = False, symint: bool = True) -> str: + type_str = ( + argument_type_str(self.type, symint=symint) + .replace("const ", "") + .replace(" &", "") + ) + + name = self.name + # s/self/input/ outside method bindings + # [old codegen] TODO: remove this? doesn't rename in codegen, it's just + # for the parse string + if name == "self" and type_str in ["Tensor", "Number"] and not method: + name = "input" + + # add default + if self.default is not None: + default = { + "nullptr": "None", + "::std::nullopt": "None", + "std::nullopt": "None", + "{}": "None", + }.get(self.default, self.default) + return f"{type_str} {name}={default}" + else: + return f"{type_str} {name}" + + def argument_str_pyi( + self, *, method: bool = False, deprecated: bool = False + ) -> str: + type_str = argument_type_str_pyi(self.type) + + name = self.name + # s/self/input/ outside method bindings + # [old codegen] TODO: remove this? 
doesn't rename in codegen, it's just + # for the parse string + if name == "self" and type_str == "Tensor" and not method and not deprecated: + name = "input" + + if name == "from": # from is a Python keyword... + name += "_" + + # pyi merges the _out and functional variants into the same signature, with an optional out arg + if name == "out" and type_str == "Tensor" and not deprecated: + type_str = "Optional[" + type_str + "]" + + # pyi deprecated signatures don't get defaults for their out arg + treat_as_no_default = ( + deprecated + and isinstance(self, PythonOutArgument) + and self.default == "None" + ) + + # add default + if self.default is not None and not treat_as_no_default: + if ( + isinstance(self.type, ListType) + and self.type.elem == BaseType(BaseTy.int) + and self.default.startswith("{") + and self.default.endswith("}") + ): + default = ( + "(" + ", ".join(map(str.strip, self.default[1:-1].split(","))) + ")" + ) + else: + default = { + "nullptr": "None", + "::std::nullopt": "None", + "std::nullopt": "None", + "{}": "None", + "c10::MemoryFormat::Contiguous": "contiguous_format", + "QScheme::PER_TENSOR_AFFINE": "per_tensor_affine", + }.get(self.default, self.default) + return f"{name}: {type_str} = {default}" + else: + return f"{name}: {type_str}" + + +@dataclass(frozen=True) +class PythonOutArgument(PythonArgument): + # In Python signature multiple output fields are packed into one 'out' argument. + # When binding to C++, it's first binded to a local 'out' variable: + # 'auto out = _r.tensorlist_n<2>(2);', + # then binded to scattered C++ output arguments as 'out[0]', 'out[1]', and etc. + # TODO: maybe don't need keep scattered out fields for python signature? + outputs: tuple[PythonArgument, ...] + + @staticmethod + def from_outputs(outputs: tuple[PythonArgument, ...]) -> PythonOutArgument | None: + if not outputs: + return None + + size = len(outputs) + if size == 1: + return PythonOutArgument( + name=outputs[0].name, + type=outputs[0].type, + default="None", + default_init=None, + outputs=outputs, + ) + elif size > 1: + if any(not a.type.is_tensor_like() for a in outputs): + raise RuntimeError(f"Unsupported output type: {outputs}") + return PythonOutArgument( + name="out", + # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None? + type=ListType(BaseType(BaseTy.Tensor), size), + default="None", + default_init=None, + outputs=outputs, + ) + raise AssertionError(r"Unexpected PythonOutArgument size") + + +@dataclass(frozen=True) +class PythonSignature: + # Base operator name, without inplace/outplace suffix. + name: str + + # Positional arguments. + # TODO: create a dedicated SelfArgument type for 'self'? + input_args: tuple[PythonArgument, ...] + + # Keyword arguments excluding the 'out' argument and scattered kwargs belonging + # to TensorOptions (dtype, layout, device, pin_memory, requires_grad, etc). + input_kwargs: tuple[PythonArgument, ...] + + output_args: PythonOutArgument | None + + # Return types, which are only used by pyi + returns: PythonReturns + + # These are scattered kwargs arguments belonging to TensorOptions. + # When binding to C++, they are packed into a TensorOptions object 'options'. + # It's possible that the C++ signature doesn't take TensorOptions object (e.g. + # for out variant), in which case they will be used as scattered fields without + # being packed into 'options'. + # TODO: maybe create a PythonTensorOptionsArgument? + tensor_options_args: tuple[PythonArgument, ...] + + # method or function signature? 
+ method: bool + + @property + def deprecated(self) -> bool: + return False + + def arguments( + self, *, skip_outputs: bool = False, skip_tensor_options: bool = False + ) -> tuple[PythonArgument | PythonOutArgument, ...]: + result: list[PythonArgument | PythonOutArgument] = [] + result.extend(self.input_args) + result.extend(self.input_kwargs) + if self.output_args is not None and not skip_outputs: + result.append(self.output_args) + if not skip_tensor_options: + result.extend(self.tensor_options_args) + return tuple(result) + + def arguments_count(self) -> int: + return len(self.arguments()) + + def output_idx(self) -> int: + return len(self.input_args) + len(self.input_kwargs) + + # [old codegen] Compute the Python function signature for argument parsing, + # as specified in torch/csrc/utils/python_arg_parser.h. WARNING: + # this is NOT the same type signature as specified by PEP 484 + # as understood by mypy; our format was independently developed + # and has some quirks to make it more suitable specifically + # for error parsing. + # + # For a translation to mypy-valid type signatures, see + # signature_str_pyi(). + def signature_str(self, *, skip_outputs: bool = False, symint: bool = True) -> str: + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: list[str] = [ + a.argument_str(method=self.method, symint=symint) for a in args + ] + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, "*") + + return f'{self.name}({", ".join(schema_formals)})' + + def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: list[str] = [ + a.argument_str_pyi(method=self.method) for a in args + ] + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, "*") + + # only pyi signatures include returns + returns_str = returns_str_pyi(self) + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> str | None: + # only pyi uses vararg signatures + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: list[str] = [ + a.argument_str_pyi(method=self.method) for a in args + ] + # vararg only applies to pyi signatures. vararg variants are not generated for all signatures + num_args = self.arguments_count() + num_positionalargs = len(self.input_args) + + have_vararg_version = False + if num_args > 0: + vararg_type = args[0].type + if ( + isinstance(vararg_type, ListType) + and str(vararg_type.elem) in ["int", "SymInt"] + and num_positionalargs == 1 + ): + have_vararg_version = True + + if not have_vararg_version: + return None + + # Below are the major changes in vararg vs. regular pyi signatures + # vararg signatures also omit the asterix + assert isinstance(vararg_type, ListType) + schema_formals[0] = ( + "*" + args[0].name + ": " + argument_type_str_pyi(vararg_type.elem) + ) + + returns_str = returns_str_pyi(self) + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + + +# The deprecated python signature involves some special logic, so create a +# dedicated data model to store these extra properties. 
+@dataclass(frozen=True) +class PythonSignatureDeprecated(PythonSignature): + # Schema for the deprecated function + deprecated_schema: FunctionSchema + + # The deprecated signature might miss some arguments that the corresponding + # C++ signature expects. We need store the constant default values to pass in. + # For example: + # [deprecate signature]: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) + # [func schema]: aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + # [func call]: self.addmm(mat1, mat2, beta, 1) + # We store ['self', 'mat1', 'mat2', 'beta', '1'] in this case. + deprecated_args_exprs: tuple[str, ...] + + @property + def deprecated(self) -> bool: + return True + + def signature_str(self, *, skip_outputs: bool = False, symint: bool = True) -> str: + return ( + PythonSignature.signature_str( + self, skip_outputs=skip_outputs, symint=symint + ) + + "|deprecated" + ) + + def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: list[str] = [ + a.argument_str_pyi(method=self.method, deprecated=True) for a in args + ] + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, "*") + + returns_str = returns_str_pyi(self) + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> str | None: + # the codegen doesn't include vararg variants for deprecated signatures + return None + + +# This struct is used to hold the PythonSignature and its corresponding +# NativeFunction BEFORE grouping base and out-variant functions. +# Why not store NativeFunction in PythonSignature or construct PythonSignature +# from NativeFunction? Because they are not 1-1 mapped. +# One native function could have both deprecated and non-deprecated python +# signatures - NativeFunction doesn't contain information to construct the +# deprecated python signature. +# One python signature is used to handle both the base and the out-variant +# function - see 'PythonSignatureGroup'. +@dataclass(frozen=True) +class PythonSignatureNativeFunctionPair: + signature: PythonSignature + function: NativeFunction + + +# We merge pairs of functions with signatures that are equivalent mod +# output arguments, and use a single entry in the python_arg_parser sig +# list for both (output arguments become optional). +@dataclass(frozen=True) +class PythonSignatureGroup: + # The signature used for Python argument parsing. The outplace signature + # is preferred if exists, because it can be used to parse inputs for both + # the out-place variant and the base version (with output omitted). + signature: PythonSignature + + # The regular ATen declaration (e.g. conv2d) + base: NativeFunction + + # The out variant (e.g. conv2d_out) + outplace: NativeFunction | None + + @classmethod + def from_pairs( + cls, + functional: PythonSignatureNativeFunctionPair, + out: PythonSignatureNativeFunctionPair | None, + ) -> PythonSignatureGroup: + if out is None: + return PythonSignatureGroup( + signature=functional.signature, + base=functional.function, + outplace=None, + ) + + # prefer the signature with optional out=... arguments because it's the + # superset that can be used to parse input for both base and outplace. 
+ signature_kwargs = out.signature.__dict__.copy() + + # Out overloads in C++ don't have TensorOptions arguments, + # so take these from the functional variant + signature_kwargs[ + "tensor_options_args" + ] = functional.signature.tensor_options_args + + return PythonSignatureGroup( + signature=type(out.signature)(**signature_kwargs), + base=functional.function, + outplace=out.function, + ) + + +# C++ function dispatch is wrapped in a lambda function. The lambda function +# has almost the same signature as the C++ function, only with some small +# variants - see details below. +# This data model is used to represent arguments of the lambda function +# signature. +@dataclass(frozen=True) +class DispatchLambdaArgument: + name: str + type_str: str + is_out_arg: bool + + +# To pass PyObjects arguments to C++ function (via the lambda wrapper), +# we need first convert PyObjects into simple C++ objects. This work +# is done by PythonArgParser. +# This data model is used to represent the output of PythonArgParser. +# It has 1-1 mapping with PythonArgument in PythonSignature. +@dataclass(frozen=True) +class PythonArgParserOutputExpr: + # argument name + name: str + + # RHS expression to reference PythonArgParser output. + expr: str + + # In some special cases we need create different expr, e.g.: + # '_r.isNone(1)' instead of '_r.tensor(1)'. + index: int + + # The python argument it maps to. + argument: PythonArgument + + @property + def is_none_expr(self) -> str: + return f"_r.isNone({self.index})" + + +# To pass PythonArgParser output to the lambda wrapper, we need bind +# PythonArgParserOutputExpr to DispatchLambdaArgument. +# They are not always 1-1 mapped, e.g. scattered TensorOptions fields +# need be packed into a TensorOptions object, which is the argument +# that the lambda function wrapper takes. +@dataclass(frozen=True) +class DispatchLambdaArgumentExprs: + # The exprs that provide the binding for lambda arguments, e.g.: + # + # 'self' -> '_r.tensor(0)' + # 'min' -> 'out[0]' / 'min_indices' -> 'out[1]' + # 'options' -> 'options' + # + # It has 1-1 mapping with DispatchLambdaArgument. + exprs: Sequence[str] + + # Special local inits, which might introduce new variables that + # the 'exprs' above reference, e.g.: + # + # 'auto out = _r.tensorlist_n<2>(2);' + # + inits: Sequence[str] + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Helper Functions +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: + return CppSignatureGroup.from_native_function(f, method=method).signature + + +def has_tensor_options(f: NativeFunction) -> bool: + return f.func.arguments.tensor_options is not None + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Python Signature +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +# 'simple_type' was introduced by the old codegen, which is slightly +# different from the python schema type, e.g.: doesn't have '?' suffix +# for optional Tensor/TensorList; doesn't have '[size]' suffix for list type. 
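+# (For example, an `int[2]` argument renders as "IntArrayRef[2]", but with
+# simple_type=True the size is dropped and it renders as just "IntArrayRef".)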
+def argument_type_str( + t: Type, *, simple_type: bool = False, symint: bool = True +) -> str: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + return "Tensor" + elif t.name == BaseTy.int: + return "int64_t" + elif t.name == BaseTy.float: + return "double" + elif t.name == BaseTy.str: + return "c10::string_view" + elif t.name in [ + BaseTy.bool, + BaseTy.QScheme, + BaseTy.Scalar, + BaseTy.ScalarType, + BaseTy.Generator, + BaseTy.Storage, + BaseTy.Layout, + BaseTy.Device, + BaseTy.DeviceIndex, + BaseTy.MemoryFormat, + BaseTy.Dimname, + BaseTy.Stream, + BaseTy.ConstQuantizerPtr, + BaseTy.SymInt, + ]: + # These python schema type names line up with their function schema names + return t.name.name + + elif isinstance(t, OptionalType): + if str(t.elem) == "Tensor": + # Is it desired to keep '?' for simple_type with new style dispatcher? + return "Tensor?" + elem = argument_type_str(t.elem, simple_type=simple_type, symint=symint) + return f"{elem}?" + elif isinstance(t, ListType): + size = t.size if not simple_type else None + if str(t.elem) == "bool": + assert t.size is not None + return f"::std::array" + elif str(t.elem) == "int": + return f"IntArrayRef[{size}]" if size is not None else "IntArrayRef" + elif str(t.elem) == "SymInt": + if symint: + return ( + f"SymIntArrayRef[{size}]" if size is not None else "SymIntArrayRef" + ) + else: + return f"IntArrayRef[{size}]" if size is not None else "IntArrayRef" + elif str(t.elem) == "Tensor": + return f"TensorList[{size}]" if size is not None else "TensorList" + elif str(t.elem) == "Scalar": + return f"ScalarList[{size}]" if size is not None else "ScalarList" + elif str(t.elem) == "Tensor?": + if simple_type: + return "c10::List<::std::optional>" + else: + return "const c10::List<::std::optional> &" + elif str(t.elem) == "Dimname": + return f"DimnameList[{size}]" if size is not None else "DimnameList" + elem = argument_type_str(t.elem, simple_type=simple_type, symint=symint) + return f"ArrayRef<{elem}>" + + raise RuntimeError(f"unrecognized type {repr(t)}") + + +def argument_type_size(t: Type) -> int | None: + l = t.is_list_like() + if l is not None and str(l.elem) != "bool": + return l.size + else: + return None + + +def argument(a: Argument) -> PythonArgument: + return PythonArgument( + name=a.name, + type=a.type, + # TODO: directly translate a.default to python default + default=( + str(pythonify_default(cpp.default_expr(a.default, a.type, symint=False))) + if a.default is not None + else None + ), + default_init=None, + ) + + +# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen +def signature( + f: NativeFunction, *, method: bool = False, pyi: bool = False +) -> PythonSignature: + return signature_from_schema( + f.func, category_override=f.category_override, method=method, pyi=pyi + ) + + +def signature_from_schema( + func: FunctionSchema, + *, + category_override: str | None, + method: bool = False, + pyi: bool = False, +) -> PythonSignature: + args: list[Argument] = [] + args.extend(func.arguments.pre_self_positional) + # Skip SelfArgument if this is method. + if not method and func.arguments.self_arg is not None: + args.append(func.arguments.self_arg.argument) + args.extend(func.arguments.post_self_positional) + args.extend(func.arguments.pre_tensor_options_kwarg_only) + # Skip TensorOptionsArguments. Python side TensorOptions + # arguments are created based on different rules - see below. 
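# (The Python-side fields added below are dtype, layout, device, pin_memory
# and requires_grad; they are collected into tensor_options_args rather than
# taken from the C++ TensorOptions argument.)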
+ args.extend(func.arguments.post_tensor_options_kwarg_only) + args.extend(func.arguments.out) + + input_arg_set = {a.name for a in func.arguments.flat_positional} + kwarg_only_set = {a.name for a in func.arguments.flat_kwarg_only} + out_arg_set = {a.name for a in func.arguments.out} + + input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) + input_kwargs = tuple( + map(argument, filter(lambda a: a.name in kwarg_only_set, args)) + ) + outputs = tuple(map(argument, filter(lambda a: a.name in out_arg_set, args))) + + # Reintroduce the scattered fields of TensorOptions for Python. + # Compared to the cpp counterpart, the python arguments have new property + # (default_init) and a new argument 'requires_grad', which require some + # special handlings. + # [old codegen] TODO: because these aren't guaranteed to be 100% faithful + # to the original versions in the yaml, this recreation is a potential + # source of drift between eager and JIT. Pull this logic out to a shared place. + + has_tensor_input_arg = any( + a.type.is_tensor_like() for a in func.arguments.flat_non_out + ) + if any(a.name == "requires_grad" for a in func.schema_order_arguments()): + raise ValueError( + "argument named requires_grad is reserved, should not explicitly add it in the schema" + ) + + # [old codegen] this probably won't work if one of the returns is not a tensor, + # but it will produce a compile-time error that is obvious. + has_tensor_return = any(r.type.is_tensor_like() for r in func.returns) + + name: str = cpp.name(func) + is_factory_function = category_override == "factory" or ( + has_tensor_return and not has_tensor_input_arg + ) + is_like_or_new_function = ( + category_override in ("new", "like") + or name.startswith("new_") + or name.endswith("_like") + ) + is_dummy_function = category_override == "dummy" + + tensor_options_args: list[PythonArgument] = [] + if (is_factory_function or is_like_or_new_function) and not is_dummy_function: + + def topt_default_init(name: str) -> str | None: + topt_args = func.arguments.tensor_options + if topt_args is None: + return None + a = getattr(topt_args, name) + if a.default is None or a.default == "None": + return None + return cpp.default_expr(a.default, a.type, symint=False) + + tensor_options_args.append( + PythonArgument( + name="dtype", + type=OptionalType(BaseType(BaseTy.ScalarType)), + default="None", + default_init=( + None if is_like_or_new_function else topt_default_init("dtype") + ), + ) + ) + tensor_options_args.append( + PythonArgument( + name="layout", + type=OptionalType(BaseType(BaseTy.Layout)), + default="None", + default_init=( + None if is_like_or_new_function else topt_default_init("layout") + ), + ) + ) + tensor_options_args.append( + PythonArgument( + name="device", + type=OptionalType(BaseType(BaseTy.Device)), + default="None", + default_init=( + None + if is_like_or_new_function + else ( + topt_default_init("device") + or "torch::tensors::get_default_device()" + ) + ), + ) + ) + tensor_options_args.append( + PythonArgument( + name="pin_memory", + type=OptionalType(BaseType(BaseTy.bool)), + default="False", + default_init=None, + ) + ) + tensor_options_args.append( + PythonArgument( + name="requires_grad", + type=OptionalType(BaseType(BaseTy.bool)), + default="False", + default_init=None, + ) + ) + + returns = PythonReturns(returns=func.returns) + + return PythonSignature( + name=str(func.name.name), + input_args=input_args, + input_kwargs=input_kwargs, + output_args=PythonOutArgument.from_outputs(outputs), + 
tensor_options_args=tuple(tensor_options_args), + returns=returns, + method=method, + ) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Python Interface +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def structseq_fieldnames(returns: tuple[Return, ...]) -> list[str]: + if len(returns) <= 1 or all(r.name is None for r in returns): + return [] + else: + if any(r.name is None for r in returns): + # When building on Windows, `PyStructSequence_UnnamedField` could not be + # resolved by the linker for some reason, which cause error in building: + # + # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol + # PyStructSequence_UnnamedField + # + # Thus, at this point in time, we do not support unnamed + # fields in structseq; you must either name all fields, + # or none of them. + raise ValueError("Unnamed field is not supported by codegen") + + return [str(r.name) for r in returns] + + +def argument_type_str_pyi(t: Type) -> str: + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + + if isinstance(t, BaseType): + if t.name in [BaseTy.int, BaseTy.DeviceIndex]: + ret = "_int" + if t.name == BaseTy.SymInt: + ret = "Union[_int, SymInt]" + elif t.name == BaseTy.float: + ret = "_float" + elif t.name == BaseTy.str: + ret = "str" + elif t.name == BaseTy.Scalar: + ret = "Union[Number, _complex]" + elif t.name == BaseTy.ScalarType: + ret = "_dtype" + elif t.name == BaseTy.bool: + ret = "_bool" + elif t.name == BaseTy.QScheme: + ret = "_qscheme" + elif t.name == BaseTy.Layout: + ret = "_layout" + elif t.name == BaseTy.Device: + ret = "Optional[DeviceLikeType]" + elif t.name == BaseTy.MemoryFormat: + ret = "memory_format" + elif t.name == BaseTy.Dimname: + ret = "Union[str, ellipsis, None]" + elif t.name == BaseTy.Storage: + ret = "Union[Storage, UntypedStorage]" + elif t.name in [BaseTy.Tensor, BaseTy.Generator, BaseTy.Stream]: + # These python schema type names line up with their function schema names + ret = t.name.name + + elif isinstance(t, ListType): + if str(t.elem) == "int": + ret = "Union[_int, _size]" if t.size is not None else "_size" + elif t.is_tensor_like(): + # TODO: this doesn't seem right... 
+ # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] + # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] + if isinstance(t.elem, OptionalType): + add_optional = True + ret = ( + "Union[Tensor, Tuple[Tensor, ...], List[Tensor]]" + if t.size is not None + else "Union[Tuple[Tensor, ...], List[Tensor]]" + ) + elif str(t.elem) == "float": + ret = "Sequence[_float]" + elif str(t.elem) == "SymInt" and t.size is not None: + elem = argument_type_str_pyi(t.elem) + ret = f"Union[{elem}, Sequence[{elem}]]" + else: + elem = argument_type_str_pyi(t.elem) + ret = f"Sequence[{elem}]" + + else: + raise RuntimeError(f"unrecognized type {repr(t)}") + + if add_optional: + ret = "Optional[" + ret + "]" + + return ret + + +def return_type_str_pyi(t: Type) -> str: + # Where arguments are open to accepting Union, return types should return + # concrete types + + if isinstance(t, OptionalType): + inner = return_type_str_pyi(t.elem) + return f"Optional[{inner}]" + + if isinstance(t, BaseType): + if t.name == BaseTy.Device: + return "_device" + elif t.name == BaseTy.Dimname: + ret = "Optional[str]" + else: + return argument_type_str_pyi(t) + + if isinstance(t, ListType): + inner = return_type_str_pyi(t.elem) + return f"Tuple[{inner}, ...]" + + return argument_type_str_pyi(t) + + +def returns_structseq_pyi(signature: PythonSignature) -> tuple[str, str] | None: + python_returns = [return_type_str_pyi(r.type) for r in signature.returns.returns] + structseq_name = signature.name + field_names = structseq_fieldnames(signature.returns.returns) + if field_names: + # These types are structseq objects which act like named NamedTuples, but + # the constructor acts like the constructor of tuple. Using typing.NamedTuple + # does not allow us to override __init__. + seq_type = f"Tuple[{', '.join(python_returns)}]" + structseq_def_lines = [ + f"class {structseq_name}({seq_type}):", + ] + for name, typ in zip(field_names, python_returns): + structseq_def_lines.extend( + [ + " @property", + f" def {name}(self) -> {typ}: ...", + ] + ) + structseq_def_lines.extend( + [ + f" def __new__(cls, sequence: {seq_type}): ...", + f" n_fields: _int = {len(field_names)}", + f" n_sequeunce_fields: _int = {len(field_names)}", + " n_unnamed_fields: _int = 0", + " def __init_subclass__(cls) -> NoReturn: ... # prohibit subclassing", + "", # add an extra newline + ] + ) + structseq_def = "\n".join(structseq_def_lines) + # Example: + # structseq_def = ( + # "class max(Tuple[Tensor, Tensor]):\n" + # " @property\n" + # " def values(self) -> Tensor: ...\n" + # " @property\n" + # " def indices(self) -> Tensor: ...\n" + # " def __new__(cls, sequence: Tuple[Tensor, Tensor]): ...\n" + # " n_fields: _int = 2", + # " n_sequeunce_fields: _int = 2", + # " n_unnamed_fields: _int = 0", + # " def __init_subclass__(cls) -> NoReturn: ... 
# prohibit subclassing", + # ) + return structseq_name, structseq_def + return None + + +def returns_str_pyi(signature: PythonSignature) -> str: + field_names = structseq_fieldnames(signature.returns.returns) + if field_names: + return f"torch.return_types.{signature.name}" + + python_returns = [return_type_str_pyi(r.type) for r in signature.returns.returns] + if len(python_returns) > 1: + return "Tuple[" + ", ".join(python_returns) + "]" + if len(python_returns) == 1: + return python_returns[0] + return "None" + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# C++ Function Dispatch +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# This section provides APIs to generate the code that does C++ function +# dispatch. The C++ function call is wrapped by a lambda function. +# For example: +# +# // aten::selu_(Tensor(a!) self) -> Tensor(a!) +# auto dispatch_selu_ = [](Tensor self) -> Tensor { +# pybind11::gil_scoped_release no_gil; +# return at::selu_(self); +# }; +# +# The lambda function's signature follows the C++ signature in common +# cases, e.g.: +# +# // aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +# [](const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor +# +# For out variant the 'out' argument's type is changed from 'Tensor &' +# to 'Tensor'. It's because when calling the lambda it passes in the +# PythonArgParser output '_r.tensor(3)', which is stack allocated object +# and needs to pass by value. Also see comments in 'dispatch_lambda_return_str()'. +# +# // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +# [](Tensor out, const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor +# +# For multi-output case it can keep using reference type because the +# PythonArgParser output has been unpacked to local variables, e.g.: +# +# // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, +# // Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) +# [](Tensor & max, Tensor & max_values, const Tensor & self, Dimname dim, bool keepdim) -> std::tuple +# +# For deprecated python signature, it should follow deprecated python arg order. +# TODO: This is to keep same byte-for-byte result as the old codegen - maybe unnecessary? + + +def dispatch_lambda_args( + ps: PythonSignature, f: NativeFunction, symint: bool = True +) -> tuple[DispatchLambdaArgument, ...]: + if isinstance(ps, PythonSignatureDeprecated): + schema = ps.deprecated_schema + else: + schema = f.func + + # Start with cpp arguments - dispatch lambda signature always include 'self' + cpp_args = cpp.arguments( + arguments=schema.arguments, + faithful=False, + symint=symint, + method=False, + cpp_no_default_args=f.cpp_no_default_args, + ) + out_args: set[str] = {a.name for a in schema.arguments.out} + + # Convert from cpp argument to lambda argument + def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument: + type_str = cpp_arg.type + is_out_arg = cpp_arg.name in out_args + if ps.method and cpp_arg.name == "self": + # For method's 'self', we can use 'const Tensor &' and simply ignore mutability! + type_str = "const at::Tensor &" + else: + # For other cases we need prevent dangling refs to temps (unless it's + # unpacked scattered output) + # The reason is explained in the comments above and in 'dispatch_lambda_return_str()'. + # TODO: avoid this special handling? 
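# (Concretely, for aten::add.out the single out argument is taken by the
# lambda as "Tensor out" by value rather than "Tensor & out", matching the
# add.out example in the block comment above dispatch_lambda_args.)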
+ ensure_temp_safe = len(out_args) <= 1 or not is_out_arg + if ensure_temp_safe: + type_str = { + "at::Tensor &": "at::Tensor", + }.get(type_str, type_str) + return DispatchLambdaArgument( + name=cpp_arg.name, + type_str=type_str, + is_out_arg=is_out_arg, + ) + + return tuple(map(dispatch_lambda_arg, cpp_args)) + + +# [old codegen] XXX: if you got here because of an assertion failure, it doesn't mean +# it's enough to just extend the list here. Before you do this, make sure +# to add an appropriate wrap() overload in torch/csrc/autograd/utils/wrap_outputs.h. +SUPPORTED_RETURN_TYPES = { + "at::Tensor", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple", + "::std::tuple>", + "::std::vector", + # Needed for flash attention forw/backward + "::std::tuple", + "at::Scalar", + "bool", + "int64_t", + "void*", + "void", + "at::QScheme", + "double", + "at::IntArrayRef", + "at::ScalarType", + "at::Stream", +} + + +def dispatch_lambda_return_str(f: NativeFunction) -> str: + # [old codegen] Remove type annotation (e.g. 'Tensor' rather than 'Tensor &') + # because the dispatch lambdas take mutable arguments *by value*, not + # by reference. If you then return a reference to such an argument, you + # will now have a pointer to a dangling stack entry. Not good. + # + # You want: + # + # auto dispatch_selu_ = [](Tensor self) -> Tensor { ...; return at::selu_(self); }; + # ^^^^^^ + # + # *not* + # + # auto dispatch_selu_ = [](Tensor self) -> Tensor& { ...; return at::selu_(self); }; + # ^^^^^^^ + # + # (NB: We can't make dispatch_selu_ take Tensor&, because the enclosing + # codegen looks like dispatch_selu_(_r.tensor(0)), and you can't take a + # mutable reference to temporary. Maybe we could assign it to a + # variable itself.) + returns_without_annotation = tuple( + Return(r.name, r.type, None) for r in f.func.returns + ) + return_str = cpp.returns_type(returns_without_annotation, symint=True).cpp_type() + if return_str not in SUPPORTED_RETURN_TYPES: + raise RuntimeError(f"{f.func.name} returns unsupported type {return_str}") + return return_str + + +def cpp_dispatch_target(f: NativeFunction) -> str: + symint = f.func.has_symint() + name = cpp.name(f.func, symint_overload=symint) + if Variant.method in f.variants: + return f"self.{name}" + if Variant.function in f.variants: + if has_tensor_options(f) or f.func.name.name.base.endswith("_like"): + namespace = "torch" + else: + namespace = "at" + return f"{namespace}::{name}" + raise RuntimeError(f"could not dispatch, neither function nor method: {f.func}") + + +def cpp_dispatch_exprs( + f: NativeFunction, + *, + python_signature: PythonSignature | None = None, +) -> tuple[str, ...]: + cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments() + + exprs: tuple[str, ...] = () + if not isinstance(python_signature, PythonSignatureDeprecated): + # By default the exprs are consistent with the C++ signature. + exprs = tuple(a.name for a in cpp_args) + else: + # For deprecated python signature we may need fill in some constants. 
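# (For the deprecated addmm example documented on PythonSignatureDeprecated,
# deprecated_args_exprs is ('self', 'mat1', 'mat2', 'beta', '1'); 'self' is
# then dropped below for the method variant.)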
+ exprs = tuple( + filter( + lambda n: n != "out" or f.func.is_out_fn(), + python_signature.deprecated_args_exprs, + ) + ) + + if Variant.method in f.variants: + exprs = tuple(filter("self".__ne__, exprs)) + + return exprs + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Python / C++ Args Binding +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +# We explicitly enumerate the PythonArgParser unpacking methods for all +# supported types. This might be more verbose than necessary, partially +# because of the irregularity of unpacking method naming, partially +# because we want to mimic the old codegen behavior - to reject +# unexpected and/or unsupported cases which the old codegen rejects. +# For certain cases it is intentionally more restrictive than necessary, +# e.g.: it doesn't accepts doublelist with definite size. +def arg_parser_unpack_method( + t: Type, default: str | None, default_init: str | None, *, symint: bool = True +) -> str: + has_default_init = default_init is not None + if has_default_init and str(t) not in ( + "ScalarType?", + "ScalarType", + "Device", + "Device?", + "Layout", + "Layout?", + "bool", + "bool?", + ): + raise RuntimeError(f"type '{t}' does not supported unpacking with default") + + if isinstance(t, BaseType): + if t.name in [ + BaseTy.Tensor, + BaseTy.Stream, + BaseTy.Storage, + BaseTy.Scalar, + BaseTy.Dimname, + ]: + # These unpack methods line up with their schema names + return t.name.name.lower() + elif t.name == BaseTy.ScalarType: + return "scalartypeWithDefault" if has_default_init else "scalartype" + elif t.name == BaseTy.Device: + return "deviceWithDefault" if has_default_init else "device" + elif t.name == BaseTy.DeviceIndex: + return "toInt64" + elif t.name == BaseTy.int: + return "toInt64" + elif t.name == BaseTy.SymInt: + return "toSymInt" if symint else "toInt64" + elif t.name == BaseTy.bool: + return "toBoolWithDefault" if has_default_init else "toBool" + elif t.name == BaseTy.float: + return "toDouble" + elif t.name == BaseTy.str: + return "stringView" + elif t.name == BaseTy.Layout: + return "layoutWithDefault" if has_default_init else "layout" + elif t.name == BaseTy.MemoryFormat: + return "memoryformat" + + elif isinstance(t, OptionalType): + if str(t.elem) == "Tensor": + return "optionalTensor" + elif str(t.elem) == "Generator": + return "generator" + elif str(t.elem) == "Dimname[]": + return "toDimnameListOptional" + elif not has_default_init and default in ( + None, + "None", + "::std::nullopt", + "std::nullopt", + ): + # If default is None: append 'Optional' to elem's unpacking method + return ( + arg_parser_unpack_method(t.elem, None, None, symint=symint) + "Optional" + ) + else: + # Otherwise, load as underlying type with default + return arg_parser_unpack_method( + t.elem, default, default_init, symint=symint + ) + + elif isinstance(t, ListType): + if str(t.elem) == "Tensor": + # accept and use definite size + return f"tensorlist_n<{t.size}>" if t.size is not None else "tensorlist" + elif str(t.elem) == "Tensor?": + return "list_of_optional_tensors" + elif str(t.elem) == "Dimname": + # accept definite size + return "dimnamelist" + elif str(t.elem) == "int": + # accept definite size + return "intlist" + elif str(t.elem) == "float": + return "doublelist" + elif str(t.elem) == "SymInt": + # accept definite size + return "symintlist" if symint else "intlist" + elif str(t.elem) == "Scalar": + return "scalarlist" + raise RuntimeError(f"type '{t}' is not supported by 
PythonArgParser") + + +# Return RHS expression for python argument using PythonArgParser output. +# e.g. for arg name 'foo', arg type 'bool', arg_index = 2, returns '_r.toBool(2)' +def arg_parser_output_expr( + arg_index: int, a: PythonArgument, *, symint: bool = True +) -> PythonArgParserOutputExpr: + has_default = a.default_init is not None + unpack_method = arg_parser_unpack_method( + t=a.type, default=a.default, default_init=a.default_init, symint=symint + ) + default = f", {a.default_init}" if has_default else "" + expr = f"_r.{unpack_method}({arg_index}{default})" + + return PythonArgParserOutputExpr( + name=a.name, + expr=expr, + index=arg_index, + argument=a, + ) + + +# Returns a map with key = arg_name and value = PythonArgParserOutputExpr. +def arg_parser_output_exprs( + ps: PythonSignature, f: NativeFunction, *, symint: bool = True +) -> dict[str, PythonArgParserOutputExpr]: + return { + e.name: e + for i, a in enumerate(ps.arguments()) + for e in (arg_parser_output_expr(i, a, symint=symint),) + } + + +# argument name to type for scattered tensor options fields +TENSOR_OPTIONS_FIELDS = { + "dtype": "ScalarType?", + "device": "Device?", + "layout": "Layout?", + "pin_memory": "bool?", + "requires_grad": "bool?", +} + + +# bind arg parser outputs (python args) with dispatch lambda arguments (c++ args). +def dispatch_lambda_exprs( + ps: PythonSignature, f: NativeFunction, *, symint: bool = True +) -> DispatchLambdaArgumentExprs: + # This method is to bind 'arg_parser_outputs' and 'lambda_args' by producing + # 'inits' and 'lambda_args_exprs' for each lambda argument using arg parser + # outputs. + arg_parser_outputs = arg_parser_output_exprs(ps, f, symint=symint) + lambda_args = dispatch_lambda_args(ps, f, symint=symint) + inits: list[str] = [] + lambda_args_exprs: dict[str, str] = {} + + has_toptions = has_tensor_options(f) + + # 1. special inits/unpacking to provide binding exprs for lambda arguments. + for a in ps.arguments(skip_tensor_options=True): + name = a.name + arg_parser_expr = arg_parser_outputs[a.name].expr + + if has_toptions and name == "self": + # TODO: why this needs to be special case? + inits.extend( + [ + f"auto self = {arg_parser_expr};", + ] + ) + lambda_args_exprs[name] = name + elif ( + isinstance(a, PythonOutArgument) + and len(a.outputs) > 1 + and f.func.is_out_fn() + ): + inits.extend( + [ + f"auto out = {arg_parser_expr};", + ] + ) + for i, out_arg in enumerate(a.outputs): + lambda_args_exprs[out_arg.name] = f"out[{i}]" + elif str(a.type) == "Dimname[]?": + # [old codegen] + # TODO: make this part of something more general, or get rid of it. + # optional> are special. The PythonArgParser returns an + # optional>, which cannot be implicitly converted to + # optional>. One needs to unwrap the optional and rewrap. + inits.extend( + [ + f"auto __{name} = {arg_parser_expr};", + f"::std::optional {name} = __{name} ? ::std::make_optional(DimnameList(__{name}.value())) : ::std::nullopt;", # noqa: B950 + ] + ) + lambda_args_exprs[name] = name + else: + # default case - directly using PythonArgParser output expr + lambda_args_exprs[name] = arg_parser_expr + + # method's self is passed directly to python binding, rather than parsed + if ps.method: + lambda_args_exprs["self"] = "self" + + # 2. special packing/checking for TensorOptions. 
+ tensor_options_args_names = [a.name for a in ps.tensor_options_args] + if has_toptions: + if f.func.is_out_fn(): + raise RuntimeError(f"{f.func}: tensor options with output arg") + for a in ps.tensor_options_args: + if a.name not in TENSOR_OPTIONS_FIELDS: + raise RuntimeError( + f"{f.func}: unrecognized tensor options field '{a.name}' in python binding arguments" + ) + if str(a.type) != TENSOR_OPTIONS_FIELDS.get(a.name): + raise RuntimeError( + f"{f.func}: unrecognized type '{str(a.type)}' for tensor options field '{a.name}'" + ) + if not all(a in tensor_options_args_names for a in TENSOR_OPTIONS_FIELDS): + raise RuntimeError( + f"{f.func}: incomplete tensor options args: {tensor_options_args_names}" + ) + + inits.append( + f"""\ +const auto options = TensorOptions() + .dtype({arg_parser_outputs['dtype'].expr}) + .device({arg_parser_outputs['device'].expr}) + .layout({arg_parser_outputs['layout'].expr}) + .requires_grad({arg_parser_outputs['requires_grad'].expr}) + .pinned_memory({arg_parser_outputs['pin_memory'].expr}); +torch::utils::maybe_initialize_device(options); +""" + ) + lambda_args_exprs["options"] = "options" + + # 3. special case - access scattered TensorOptions fields without packing + # TODO: maybe move to the generator side as it's not related to binding. + if not has_toptions and tensor_options_args_names: + if "dtype" in tensor_options_args_names: + # we're an output-arg variant, check these args against output tensor + if not f.func.is_out_fn(): + raise RuntimeError( + f"{f.func}: dtype in tensor_options_args without output arg, {ps} {ps.arguments}" + ) + if not all(a in tensor_options_args_names for a in ("layout", "device")): + raise RuntimeError( + f"{f.func}: incomplete tensor options for output check" + ) + + inits.append( + f"""\ +check_out_type_matches({arg_parser_outputs['out'].expr}, {arg_parser_outputs['dtype'].expr}, + {arg_parser_outputs['dtype'].is_none_expr}, {arg_parser_outputs['layout'].expr}, + {arg_parser_outputs['device'].expr}, {arg_parser_outputs['device'].is_none_expr}); +""" + ) + # we'll set requires_grad on outgoing tensor + if "requires_grad" not in tensor_options_args_names: + raise RuntimeError( + f'{f.func}: expected "requires_grad" in tensor_options_args absent, but found [{tensor_options_args_names}]' + ) + + return DispatchLambdaArgumentExprs( + exprs=tuple(lambda_args_exprs[a.name] for a in lambda_args), + inits=inits, + ) diff --git a/torchgen/api/structured.py b/torchgen/api/structured.py new file mode 100644 index 00000000000..93a72eb2b4a --- /dev/null +++ b/torchgen/api/structured.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +from torchgen.api import cpp +from torchgen.api.types import ( + ArgName, + ArrayRefCType, + BaseCType, + Binding, + ConstRefCType, + dimnameListT, + intArrayRefT, + iOptTensorListRefT, + iTensorListRefT, + NamedCType, + OptionalCType, + optionalIntArrayRefT, + optionalScalarRefT, + optionalTensorRefT, + scalarT, + tensorT, +) +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + ListType, + NativeFunctionsGroup, + OptionalType, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.utils import assert_never + + +# This file describes the translation of JIT schema to the structured functions API. +# This is similar to native API, but a number of historical problems with native +# API have been fixed. + + +# Translation of types occurring in JIT arguments to a C++ argument type. 
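A small sketch of what this mapping produces for Tensor arguments (illustrative only, assuming torchgen/api/structured.py is importable as torchgen.api.structured):

from torchgen.api.structured import argumenttype_type
from torchgen.model import BaseTy, BaseType, OptionalType

# Plain Tensor arguments are passed by const reference ...
self_t = argumenttype_type(BaseType(BaseTy.Tensor), mutable=False, binds="self")
assert self_t.cpp_type() == "const at::Tensor &"

# ... while optional Tensors use the lighter-weight OptionalTensorRef wrapper.
weight_t = argumenttype_type(
    OptionalType(BaseType(BaseTy.Tensor)), mutable=False, binds="weight"
)
assert weight_t.cpp_type() == "at::OptionalTensorRef"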
+# NB: For now, mutable doesn't do anything; but it could if we make +# some more nominal types +def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> NamedCType: + # If it's a value type, do the value type translation + # NB: structured kernels ALWAYS have symint off, since they involve actual + # kernels that require real ints. The one exception is the + # CompositeExplicitAutograd and the meta function (which could + # hypothetically be SymInt), but for simplicity we plan for these to just + # be handled in Python + r = cpp.valuetype_type(t, symint=False, binds=binds, mutable=mutable) + if r is not None: + return r + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + return NamedCType(binds, ConstRefCType(BaseCType(tensorT))) + elif t.name == BaseTy.Scalar: + return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) + else: + raise AssertionError(f"base type should have been value type {t}") + elif isinstance(t, OptionalType): + if t.elem == BaseType(BaseTy.Tensor): + return NamedCType(binds, BaseCType(optionalTensorRefT)) + elif t.elem == BaseType(BaseTy.Scalar): + return NamedCType(binds, BaseCType(optionalScalarRefT)) + elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int": + return NamedCType(binds, BaseCType(optionalIntArrayRefT)) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) + return NamedCType(binds, OptionalCType(elem.type)) + elif isinstance(t, ListType): + if t.elem == BaseType(BaseTy.Tensor): + return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT))) + elif t.elem == OptionalType(BaseType(BaseTy.Tensor)): + return NamedCType(binds, BaseCType(iOptTensorListRefT)) + # TODO: delete these special cases; see torchgen.api.cpp--these + # must be changed in tandem, but there are problems; see + # https://github.com/pytorch/pytorch/pull/51485 + elif str(t.elem) == "int": + return NamedCType(binds, BaseCType(intArrayRefT)) + elif str(t.elem) == "Dimname": + return NamedCType(binds, BaseCType(dimnameListT)) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) + return NamedCType(binds, ArrayRefCType(elem.type)) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: + return argumenttype_type(a.type, mutable=a.is_write, binds=binds) + + +# returns_type intentionally omitted, because structured kernels never "return"; +# instead, they always indirectly report their outputs (in the case of a meta +# function, by calling set_output; in the case of an impl function, by writing +# directly into the provided out argument). + + +# Structured kernels are never defaulted +def argument(a: Argument | SelfArgument | TensorOptionsArguments) -> list[Binding]: + if isinstance(a, Argument): + return [ + Binding( + nctype=argument_type(a, binds=a.name), + name=a.name, + default=None, + argument=a, + ) + ] + elif isinstance(a, SelfArgument): + return argument(a.argument) + elif isinstance(a, TensorOptionsArguments): + raise AssertionError("structured kernels don't support TensorOptions yet") + else: + assert_never(a) + + +def impl_arguments(g: NativeFunctionsGroup) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + + if g.out.precomputed: + # A list of parameters for the impl function with + # certain parameters replaced with precomputed counterparts + # as specified in native_functions.yaml. 
+ non_out_args_replaced: list[ + Argument | TensorOptionsArguments | SelfArgument + ] = [] + for a in g.out.func.arguments.non_out: + if isinstance(a, Argument) and a.name in g.out.precomputed.replace: + # If a is in precompute.replace, append the parameters + # that should replace it onto non_out_args_replaced. + non_out_args_replaced.extend(g.out.precomputed.replace[a.name]) + else: + # If not, push a as it is. + non_out_args_replaced.append(a) + + args.extend(non_out_args_replaced) + # g.out.precomputed.add is the list of parameters that are added + # without replacement after the non out args and just before the out args + args.extend(g.out.precomputed.add) + else: + args.extend(g.out.func.arguments.non_out) + + args.extend(g.out.func.arguments.out) + return [r for arg in args for r in argument(arg)] + + +def meta_arguments(g: NativeFunctionsGroup) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + args.extend(g.functional.func.arguments.non_out) + return [r for arg in args for r in argument(arg)] + + +def out_arguments(g: NativeFunctionsGroup) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + args.extend(g.out.func.arguments.out) + return [r for arg in args for r in argument(arg)] diff --git a/torchgen/api/translate.py b/torchgen/api/translate.py new file mode 100644 index 00000000000..761fb3c7c2b --- /dev/null +++ b/torchgen/api/translate.py @@ -0,0 +1,433 @@ +from __future__ import annotations + +from typing import NoReturn, Sequence + +from torchgen.api.types import ( + ArrayRefCType, + BaseCType, + Binding, + boolT, + ConstRefCType, + deviceT, + Expr, + intArrayRefT, + iOptTensorListRefT, + layoutT, + ListCType, + longT, + memoryFormatT, + MutRefCType, + NamedCType, + opmath_t, + OptionalCType, + optionalIntArrayRefT, + optionalScalarRefT, + optionalSymIntArrayRefT, + optionalTensorRefT, + scalar_t, + scalarT, + scalarTypeT, + SpecialArgName, + symIntArrayRefT, + SymIntT, + tensorOptionsT, + tensorT, + VectorCType, +) + + +# This file implements a small program synthesis engine that implements +# conversions between one API to another. +# +# The key data type in this file in NamedCType, short for Named C++ semantic type. A NamedCType +# represents a C++ type, plus semantic information about what it represents. +# For example, consider the argument "bool pin_memory"; its normal C++ type is +# "bool", but its C++ semantic type also keeps track that this represents a +# "pin_memory"; you can't just use a random other boolean in a context where you +# need a "pin_memory"! +# +# The translator takes a list of needed NamedCTypes, and then figures out how +# to construct expressions with these NamedCTypes from the given bindings. Many +# of these expressions are trivial (I need a Tensor other; there's a Tensor +# other scope); others are more nontrivial and may require packing/unpacking. +# Some examples of non-trivial action: +# +# - Need the "dtype" binding? Well, maybe "dtype" isn't available +# in the context, instead, "options" is, and you need to extract +# it from there. (Gather) +# +# - Need the "context" binding? Well, maybe "context" isn't available +# in the context, and you need to construct it from "dtype", "device", +# etc. (Scatter) +# +# - Need the "memory_format" binding? Well, actually, it's available +# from both "memory_format" and "options", so you had better make sure +# they are consistent. 
(Join) + +options_ctype = NamedCType("options", ConstRefCType(BaseCType(tensorOptionsT))) + +out_tensor_ctype = NamedCType("out", ConstRefCType(BaseCType(tensorT))) + +longVec_ctype = VectorCType(BaseCType(longT)) +longSymVec_ctype = VectorCType(BaseCType(SymIntT)) +optionalLongVec_ctype = OptionalCType(VectorCType(BaseCType(longT))) +optionalScalar_ctype = OptionalCType(BaseCType(scalarT)) +optionalTensor_ctype = OptionalCType(BaseCType(tensorT)) + + +class UnsatError(RuntimeError): + pass + + +# Given a set of in-scope bindings and a set of target bindings, synthesize +# a list of expressions that uses only the in-scope bindings (bindings) that +# have all of the types of goals. You may want to use this function if +# you're generating code for a function like: +# +# void f({args}) { +# g({exprs}); // g is a different API +# } +# +# and you need to generate "exprs". +# +# Typically, a list of Bindings is convenient to get (you usually call something +# like arguments() to get them); but technically you only need less information: +# for 'bindings' an (un-ordered) list of Exprs is sufficient; similarly, for +# 'goals', an (ordered) list of NamedCType goals is sufficient. If you are doing +# something more complicated, e.g., tracking the set of bindings in a context, +# you may find using these smaller types more convenient. +def translate( + bindings: Sequence[Expr | Binding], + goals: Sequence[NamedCType | Binding], + *, + method: bool = False, + allow_expensive_conversions: bool = False, +) -> list[Expr]: + binding_exprs: list[Expr] = [] + for b in bindings: + if isinstance(b, Binding): + binding_exprs.append( + Expr( + expr=b.name, + type=b.nctype, + ) + ) + else: + binding_exprs.append(b) + + goal_ctypes: list[NamedCType] = [] + for g in goals: + if isinstance(g, Binding): + goal_ctypes.append(g.nctype) + else: + goal_ctypes.append(g) + + # Add all the bindings to the context + ctx: dict[NamedCType, str] = {} + for b in binding_exprs: + ctx[b.type] = b.expr + + # While we're at it, do some simple forward inference, looking through + # constructors. + # + # NB: When should you do forward inference versus backward inference? + # The general idea: + # + # - Backward inference WHEN the goal gets smaller + # - Forward inference WHEN the hypothesis gets smaller + # + # This helps ensure termination: backward inference starts with a goal + # and tries to make it simpler and simpler until it's trivial; if the + # goal can grow in size, we blow up to a really huge goal size. + # Similarly, with forward inference we take hypotheses and decompose + # them into simpler hypotheses; if hypotheses could expand in size, + # we also have potential nontermination. (In the code below, forward + # inference is only ever carried out at a single step, but you could + # imagine repeated application of forward inference being profitable.) + # + # A good starting point in the literature for exploring more about proof + # search are these lecture notes + # https://www.cs.cmu.edu/~fp/courses/oregon-m10/04-focusing.pdf + # + # TODO: My kingdom for a pattern matcher + # https://www.python.org/dev/peps/pep-0634/ + # + # TODO: This could get us in recomputation trouble if b.expr is nontrivial. + # Fix this by implementing some sort of sharing so that if multiple + # goals share the same expression, we only compute it once. 
This seems + # to matter in practice as compiler is often unwilling to CSE nontrivial + # expressions like scalar.to() + t = b.type + if ( + isinstance(t, ConstRefCType) + and isinstance(t.elem, OptionalCType) + and isinstance(t.elem.elem, BaseCType) + and str(t.elem.elem.type) == "at::Tensor" + ): + ctx[ + NamedCType(t.elem.elem.name, ConstRefCType(BaseCType(tensorT))) + ] = f"({b.expr}.has_value() ? *{b.expr} : at::Tensor())" + + if t.type == ConstRefCType(OptionalCType(BaseCType(tensorT))): + ctx[ + NamedCType(t.name, BaseCType(optionalTensorRefT)) + ] = f"(({b.expr}.has_value() && (*{b.expr}).defined()) ? at::OptionalTensorRef(*{b.expr}) : at::OptionalTensorRef())" + + if t.type == ConstRefCType(BaseCType(scalarT)): + ctx[NamedCType(t.name, BaseCType(opmath_t))] = f"({b.expr}).to()" + + if t.type == ConstRefCType(OptionalCType(BaseCType(scalarT))): + ctx[ + NamedCType(t.name, BaseCType(optionalScalarRefT)) + ] = f"({b.expr}.has_value() ? at::OptionalScalarRef(&({b.expr}.value())) : at::OptionalScalarRef())" + + if t.type == BaseCType(scalar_t): + ctx[ + NamedCType(t.name, BaseCType(opmath_t)) + ] = f"static_cast({b.expr})" + + # [Note: IOptTensorListRef] + if t.type == ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT)))): + ctx[ + NamedCType(t.name, BaseCType(iOptTensorListRefT)) + ] = f"at::IOptTensorListRef({b.expr})" + + # Add implicit bindings if the generated code is inside a Tensor method + if method: + ctx[ + NamedCType("self", MutRefCType(BaseCType(tensorT))) + ] = "const_cast(*this)" + ctx[ + NamedCType("self", ConstRefCType(BaseCType(tensorT))) + ] = "const_cast(*this)" + # This is better! Byte-for-byte compat + # ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "*this" + + def unsat(goal: NamedCType) -> NoReturn: + ctx_desc = "\n".join( + f" {t.cpp_type()} {t.name}; // {e}" for t, e in ctx.items() + ) + raise UnsatError( + f""" +Failed to synthesize the expression "{goal.cpp_type()} {goal.name}". +When I failed, the following bindings were available in the context: + +{ctx_desc} + +This probably means there is a missing rule in the rules of torchgen.api.translate. +Check this module for more information. +""" + ) + + # A shitty backtracking search implementation. It's shitty because it + # does backtracking via stack (bad idea!) and for the most part tries to + # avoid backtracking. In particular, if + # direct=True, we won't try to do any fancy synthesis, just trivial + # conversions (e.g., "T a" is OK for "const T& a"). So all of the + # existing rules in this function simply try to solve immediately, + # and bail if things don't work out. 
+ def solve(goal: NamedCType, *, direct: bool) -> str: + def direct_solve(goal: NamedCType) -> str: + return solve(goal, direct=True) + + if goal in ctx: + # Trivial + return ctx[goal] + + # const & is satisfied with mutable & + if isinstance(goal.type, ConstRefCType): + try: + # WARNING: not strictly decreasing; be careful not + # to add a direct conversion that goes satisfies + # mutable& with const& + return solve( + NamedCType(goal.name, MutRefCType(goal.type.elem)), direct=direct + ) + except UnsatError: + pass + + # mutable & is satisfied with value + if isinstance(goal.type, MutRefCType): + try: + return solve(NamedCType(goal.name, goal.type.elem), direct=direct) + except UnsatError: + pass + + # TODO: These are referentially equal, shouldn't have to do this; + # ensuring we don't use type synonym IntArrayRef in codegen would + # help + if goal.type == ArrayRefCType(BaseCType(longT)): + return solve(NamedCType(goal.name, BaseCType(intArrayRefT)), direct=direct) + + if direct: + unsat(goal) + + # For now, all of these rules are mutually exclusive. + if goal == NamedCType("memory_format", OptionalCType(BaseCType(memoryFormatT))): + memory_format = direct_solve( + NamedCType( + SpecialArgName.possibly_redundant_memory_format, + OptionalCType(BaseCType(memoryFormatT)), + ) + ) + # No need to join "memory_format" and "options" if the target API takes "options" directly. + # Otherwise it will cause the redundant memory_format error. + if options_ctype in goal_ctypes: + return memory_format + try: + options = direct_solve(options_ctype) + return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})" + except UnsatError: + return memory_format + elif goal == NamedCType("options", BaseCType(tensorOptionsT)): + dtype = direct_solve( + NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))) + ) + pin_memory = direct_solve( + NamedCType("pin_memory", OptionalCType(BaseCType(boolT))) + ) + device = direct_solve( + NamedCType("device", OptionalCType(BaseCType(deviceT))) + ) + layout = direct_solve( + NamedCType("layout", OptionalCType(BaseCType(layoutT))) + ) + return f"TensorOptions().dtype({dtype}).layout({layout}).device({device}).pinned_memory({pin_memory})" + + elif goal == NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))): + try: + options = direct_solve(options_ctype) + return f"c10::optTypeMetaToScalarType({options}.dtype_opt())" + except UnsatError: + out_tensor = direct_solve(out_tensor_ctype) + return f"{out_tensor}.scalar_type()" + + elif goal == NamedCType("layout", OptionalCType(BaseCType(layoutT))): + try: + options = direct_solve(options_ctype) + return f"{options}.layout_opt()" + except UnsatError: + out_tensor = direct_solve(out_tensor_ctype) + return f"{out_tensor}.layout()" + + elif goal == NamedCType("device", OptionalCType(BaseCType(deviceT))): + try: + options = direct_solve(options_ctype) + return f"{options}.device_opt()" + except UnsatError: + out_tensor = direct_solve(out_tensor_ctype) + return f"{out_tensor}.device()" + + elif goal == NamedCType("pin_memory", OptionalCType(BaseCType(boolT))): + try: + options = direct_solve(options_ctype) + return f"{options}.pinned_memory_opt()" + except UnsatError: + # If we're calling a factory op from its out= variant, + # We don't actually care about the value of pin_memory. 
+ out_tensor = direct_solve(out_tensor_ctype) + return "::std::nullopt" + + # We can always do translations from value types to reference types, like vector -> IntArrayRef + elif goal.type == BaseCType(intArrayRefT): + try: + return direct_solve(NamedCType(goal.name, longVec_ctype)) + except UnsatError: + # We can also go SymIntArrayRef -> IntArrayRef + symIntArrayRef_type = direct_solve( + NamedCType(goal.name, BaseCType(symIntArrayRefT)) + ) + return f"C10_AS_INTARRAYREF_SLOW({symIntArrayRef_type})" + elif goal.type == BaseCType(symIntArrayRefT): + try: + r = direct_solve(NamedCType(goal.name, BaseCType(intArrayRefT))) + return f"c10::fromIntArrayRefSlow({r})" + except UnsatError: + return direct_solve(NamedCType(goal.name, longSymVec_ctype)) + elif goal.type == BaseCType(SymIntT): + return direct_solve(NamedCType(goal.name, BaseCType(longT))) + elif goal.type == OptionalCType(BaseCType(SymIntT)): + argname = direct_solve( + NamedCType(goal.name, OptionalCType(BaseCType(longT))) + ) + return f"{argname}.has_value() ? ::std::make_optional(c10::SymInt(*{argname})) : ::std::nullopt" + elif goal.type == BaseCType(longT): + symInt_type = direct_solve(NamedCType(goal.name, BaseCType(SymIntT))) + return f"{symInt_type}.guard_int(__FILE__, __LINE__)" + elif goal.type == OptionalCType(BaseCType(longT)): + argname = direct_solve( + NamedCType(goal.name, OptionalCType(BaseCType(SymIntT))) + ) + return f"{argname}.has_value() ? ::std::make_optional({argname}->guard_int(__FILE__, __LINE__)) : ::std::nullopt" + elif goal.type == BaseCType(optionalIntArrayRefT): + try: + return direct_solve(NamedCType(goal.name, optionalLongVec_ctype)) + except UnsatError: + argname = direct_solve( + NamedCType(goal.name, BaseCType(optionalSymIntArrayRefT)) + ) + return f"{argname}.has_value() ? ::std::make_optional(C10_AS_INTARRAYREF_SLOW(*{argname})) : ::std::nullopt" + elif goal.type == BaseCType(optionalSymIntArrayRefT): + # TODO: You might also want to solve this from longSymVec_ctype or + # an optional version of it + argname = direct_solve( + NamedCType(goal.name, BaseCType(optionalIntArrayRefT)) + ) + return f"{argname}.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*{argname})) : ::std::nullopt" + elif goal.type == BaseCType(optionalScalarRefT): + return direct_solve(NamedCType(goal.name, optionalScalar_ctype)) + elif goal.type == BaseCType(optionalTensorRefT): + return direct_solve(NamedCType(goal.name, optionalTensor_ctype)) + + # Note [translation from C++ reference to value types] + # The below cases are all for when we have an argument with a reference type, + # and a corresponding goal with a value type. + # These are needed when we populate the inputs to a lambda capture and we need + # to guarantee the lifetime of each captured argument. + # We guard it with an explicit kwarg because converting to a value type is expensive + # (O(n)) to convert from IntArrayRef to vector), + # so the caller of translate() should be explicit that they need it. 
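# (For example, an at::IntArrayRef binding can satisfy a
# ::std::vector<int64_t> goal via ".vec()", but only when the caller passes
# allow_expensive_conversions=True.)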
+ if allow_expensive_conversions: + if goal.type == VectorCType(BaseCType(longT)): + intArrayRef_ctype = NamedCType(goal.name, BaseCType(intArrayRefT)) + argname = direct_solve(intArrayRef_ctype) + return f"{argname}.vec()" + if goal.type == VectorCType(BaseCType(SymIntT)): + symIntArrayRef_ctype = NamedCType(goal.name, BaseCType(symIntArrayRefT)) + argname = direct_solve(symIntArrayRef_ctype) + return f"{argname}.vec()" + elif goal.type == OptionalCType(VectorCType(BaseCType(longT))): + optionalIntArrayRef_ctype = NamedCType( + goal.name, BaseCType(optionalIntArrayRefT) + ) + argname = direct_solve(optionalIntArrayRef_ctype) + return f"{argname}.has_value() ? ::std::make_optional({argname}->vec()) : ::std::nullopt" + elif goal.type == OptionalCType(BaseCType(scalarT)): + optionalScalarRef_ctype = NamedCType( + goal.name, BaseCType(optionalScalarRefT) + ) + argname = direct_solve(optionalScalarRef_ctype) + return f"{argname}.has_value() ? ::std::make_optional({argname}) : ::std::nullopt" + elif goal.type == OptionalCType(BaseCType(scalarT)): + optionalTensorRef_ctype = NamedCType( + goal.name, BaseCType(optionalTensorRefT) + ) + argname = direct_solve(optionalTensorRef_ctype) + return f"{argname}.has_value() ? ::std::make_optional({argname}) : ::std::nullopt" + # Technically, we also need to handle cases of C++ containers holding reference types. + # But there currently aren't any ops that require lambda capture codegen + # With arguments like ::std::vector. + # If that changes, we'll have to add the translation here. + + # We allow const casting on tensors, since const-correctness is a bit broken for at::Tensor. + # We could probably generalize this to non-tensor types too. + if goal.type == MutRefCType(BaseCType(tensorT)): + const_ref_tensor_ctype = NamedCType( + goal.name, ConstRefCType(BaseCType(tensorT)) + ) + argname = direct_solve(const_ref_tensor_ctype) + return f"const_cast({argname})" + + unsat(goal) + + return [Expr(solve(g, direct=False), g) for g in goal_ctypes] diff --git a/torchgen/api/types/__init__.py b/torchgen/api/types/__init__.py new file mode 100644 index 00000000000..4e98bb8df49 --- /dev/null +++ b/torchgen/api/types/__init__.py @@ -0,0 +1,5 @@ +from torchgen.api.types.types import * +from torchgen.api.types.types_base import * + + +from torchgen.api.types.signatures import * # usort: skip diff --git a/torchgen/api/types/signatures.py b/torchgen/api/types/signatures.py new file mode 100644 index 00000000000..f7d85ca6e2f --- /dev/null +++ b/torchgen/api/types/signatures.py @@ -0,0 +1,426 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterator, Sequence, TYPE_CHECKING + +from torchgen.api.types.types_base import Binding, CType, Expr + + +if TYPE_CHECKING: + from torchgen.model import ( + BackendIndex, + FunctionSchema, + NativeFunction, + NativeFunctionsGroup, + NativeFunctionsViewGroup, + ) + + +@dataclass(frozen=True) +class CppSignature: + """ + A CppSignature represents a single overload in the C++ API. For + any given function schema, there may be multiple CppSignatures + corresponding to it, based on how we desugar to C++. See also + CppSignatureGroup. + """ + + # The schema this signature is derived from + func: FunctionSchema + + # Is this a C++ signature for a method, i.e. Tensor::my_op(...)? + method: bool + + # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API + # (i.e. 
with a potential TensorOptions argument and out arguments in the front) + faithful: bool + + # Is this a symint C++ signature. For BC reasons, functions that take + # SymInts still present as int64_t in C++, and the SymInt variant is + # offered at a different overload name + # + # NB: If a function RETURNS a SymInt, this is ALWAYS false + symint: bool + + # The set of C++ arguments which should not have defaults applied to them + cpp_no_default_args: set[str] + + # Is this a fallback C++ binding? Fallback bindings are enabled by + # manual_cpp_binding: True and are alternate, non-public API that + # lets manual C++ binding implementors access the binding that would + # have been automatically generated + fallback_binding: bool = False + + # Return the unpacked argument structure of this signature, + # discarding information about which arguments are semantically + # related to each other. + def arguments(self) -> Sequence[Binding]: + return cpp.arguments( + self.func.arguments, + faithful=self.faithful, + symint=self.symint, + method=self.method, + cpp_no_default_args=self.cpp_no_default_args, + ) + + def name(self, *, suppress_symint_suffix: bool = False) -> str: + n = cpp.name( + self.func, + faithful_name_for_out_overloads=self.faithful, + symint_overload=False if suppress_symint_suffix else self.symint, + ) + if self.fallback_binding: + n = f"__dispatch_{n}" + return n + + # Render the C++ declaration for this signature + def decl( + self, + *, + name: str | None = None, + prefix: str = "", + is_redispatching_fn: bool = False, + suppress_symint_suffix: bool = False, + ) -> str: + returns_type = cpp.returns_type( + self.func.returns, symint=self.symint + ).cpp_type() + cpp_args = [a.decl() for a in self.arguments()] + if is_redispatching_fn: + cpp_args = ["c10::DispatchKeySet dispatchKeySet"] + cpp_args + cpp_args_str = ", ".join(cpp_args) + if name is None: + name = prefix + self.name(suppress_symint_suffix=suppress_symint_suffix) + return f"{returns_type} {name}({cpp_args_str})" + + # Render the C++ definition for this signature, not including + # the body (with curly braces) + def defn( + self, + *, + name: str | None = None, + prefix: str = "", + is_redispatching_fn: bool = False, + ) -> str: + returns_type = cpp.returns_type( + self.func.returns, symint=self.symint + ).cpp_type() + cpp_args = [a.defn() for a in self.arguments()] + if is_redispatching_fn: + cpp_args = ["c10::DispatchKeySet dispatchKeySet"] + cpp_args + cpp_args_str = ", ".join(cpp_args) + if name is None: + name = prefix + self.name() + return f"{returns_type} {name}({cpp_args_str})" + + def ptr_type(self) -> str: + args_types_str = ", ".join(a.type for a in self.arguments()) + return f"{cpp.returns_type(self.func.returns, symint=self.symint).cpp_type()} (*)({args_types_str})" + + # Return the C++ function type, e.g., something like int(bool) + def type(self) -> str: + args_types_str = ", ".join(a.type for a in self.arguments()) + return f"{cpp.returns_type(self.func.returns, symint=self.symint).cpp_type()} ({args_types_str})" + + +# Represents group of all CppSignatures associated with a +# FunctionSchema. Right now, that's the regular, user-visible +# signature, as well as a "faithful" signature which doesn't +# have grouping. 
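A short sketch of how a single CppSignature renders a declaration (illustrative only; it constructs the dataclass directly and uses FunctionSchema.parse from torchgen.model):

from torchgen.api.types import CppSignature
from torchgen.model import FunctionSchema

schema = FunctionSchema.parse(
    "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
)
sig = CppSignature(
    func=schema,
    method=False,
    faithful=False,
    symint=False,
    cpp_no_default_args=set(),
)
# sig.decl() renders roughly:
#   at::Tensor add(const at::Tensor & self, const at::Tensor & other,
#                  const at::Scalar & alpha=1)
print(sig.decl())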
+@dataclass(frozen=True) +class CppSignatureGroup: + func: FunctionSchema + signature: CppSignature + faithful_signature: CppSignature | None + symint_signature: CppSignature | None + symint_faithful_signature: CppSignature | None + + def most_faithful_signature(self) -> CppSignature: + if self.faithful_signature: + return self.faithful_signature + else: + return self.signature + + def signatures(self, *, symint: bool = True) -> Iterator[CppSignature]: + yield self.signature + if self.faithful_signature: + yield self.faithful_signature + if symint: + if self.symint_signature: + yield self.symint_signature + if self.symint_faithful_signature: + yield self.symint_faithful_signature + + @staticmethod + def from_native_function( + f: NativeFunction, *, method: bool, fallback_binding: bool = False + ) -> CppSignatureGroup: + func = f.func + + def make_sig(*, faithful: bool, symint: bool) -> CppSignature: + return CppSignature( + func=func, + faithful=faithful, + symint=symint, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args, + ) + + def make_sigs(*, symint: bool) -> tuple[CppSignature, CppSignature | None]: + faithful_signature: CppSignature | None = None + if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: + faithful_signature = make_sig(faithful=True, symint=symint) + signature = make_sig(faithful=False, symint=symint) + return signature, faithful_signature + + signature, faithful_signature = make_sigs(symint=False) + symint_signature: CppSignature | None = None + symint_faithful_signature: CppSignature | None = None + if func.has_symint(): + symint_signature, symint_faithful_signature = make_sigs(symint=True) + + return CppSignatureGroup( + func=func, + signature=signature, + faithful_signature=faithful_signature, + symint_signature=symint_signature, + symint_faithful_signature=symint_faithful_signature, + ) + + +@dataclass(frozen=True) +class DispatcherSignature: + # The schema this signature is derived from + func: FunctionSchema + + # Allows you to prepend an arbitrary prefix to the signature name. + # This is useful for parts of the codegen that generate wrappers around kernels, + # and need to avoid naming collisions. 
+ prefix: str = "" + + symint: bool = True + + def arguments(self) -> list[Binding]: + return dispatcher.arguments(self.func, symint=self.symint) + + def name(self) -> str: + return self.prefix + dispatcher.name(self.func) + + def decl(self, name: str | None = None) -> str: + args_str = ", ".join(a.decl() for a in self.arguments()) + if name is None: + name = self.name() + return f"{self.returns_type().cpp_type()} {name}({args_str})" + + def defn( + self, name: str | None = None, *, is_redispatching_fn: bool = False + ) -> str: + args = [a.defn() for a in self.arguments()] + if is_redispatching_fn: + args = ["c10::DispatchKeySet dispatchKeySet"] + args + args_str = ", ".join(args) + if name is None: + name = self.name() + return f"{self.returns_type().cpp_type()} {name}({args_str})" + + def exprs(self) -> list[Expr]: + return [Expr(a.name, a.nctype) for a in self.arguments()] + + def returns_type(self) -> CType: + return dispatcher.returns_type(self.func.returns, symint=self.symint) + + def ptr_type(self) -> str: + dispatcher_args_types_str = ", ".join(a.type for a in self.arguments()) + return f"{self.returns_type().cpp_type()} (*)({dispatcher_args_types_str})" + + # Return the C++ function type, e.g., something like int(bool) + def type(self) -> str: + dispatcher_args_types_str = ", ".join(a.type for a in self.arguments()) + return f"{self.returns_type().cpp_type()} ({dispatcher_args_types_str})" + + @staticmethod + def from_schema( + func: FunctionSchema, *, prefix: str = "", symint: bool = True + ) -> DispatcherSignature: + return DispatcherSignature(func, prefix, symint) + + +@dataclass(frozen=True) +class NativeSignature: + # The schema this signature is derived from + func: FunctionSchema + + symint: bool + + prefix: str = "" + + def name(self) -> str: + return self.prefix + native.name(self.func) + + def decl(self, name: str | None = None) -> str: + args_str = ", ".join(a.decl() for a in self.arguments()) + if name is None: + name = self.name() + return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} {name}({args_str})" + + def defn(self, name: str | None = None) -> str: + args_str = ", ".join(a.defn() for a in self.arguments()) + if name is None: + name = self.name() + return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} {name}({args_str})" + + def ptr_type(self) -> str: + # don't include defaults in type signature! 
+ args_str = ", ".join(a.defn() for a in self.arguments()) + return f"{native.returns_type(self.func.returns, symint=self.symint).cpp_type()} (*)({args_str})" + + def arguments(self) -> list[Binding]: + return native.arguments(self.func, symint=self.symint) + + def returns_type(self) -> CType: + return native.returns_type(self.func.returns, symint=self.symint) + + def dispatcher_exprs(self) -> list[Expr]: + return translate.translate( + self.arguments(), dispatcher.arguments(self.func), method=False + ) + + +@dataclass(frozen=True) +class ViewInverseSignature: + g: NativeFunctionsViewGroup + + def name(self) -> str: + return functionalization.reverse_name(self.g.view, include_namespace=False) + + def decl(self) -> str: + return_type = functionalization.returns_type(self.g.view.func) + decls = [ + a.decl() + for a in functionalization.inner_arguments( + self.g.view.func, is_reverse=True + ) + ] + return f"static {return_type.cpp_type()} {self.name()}({', '.join(decls)});" + + +@dataclass(frozen=True) +class FunctionalizationLambda: + g: NativeFunctionsViewGroup + + # are we generating the forward lambda or the reverse lambda? + is_reverse: bool + + def captures(self) -> list[Expr]: + # The lambda lives inside of a kernel following the dispatcher API, so its outer context is the dispatcher arguments + # We also need to read the "reapply views" TLS at the time that the functionalization kernel was executed, + # and plumb it into the lambda. + outer_ctx = dispatcher.arguments(self.g.view.func) + [ + functionalization.reapply_views_binding, + functionalization.inverse_return_mode_binding, + ] + capture_bindings = functionalization.capture_arguments( + self.g.view.func, is_reverse=self.is_reverse + ) + # allow_expensive_conversions is set because we want to convert + # some reference types (IntArrayRef) to value types (vector). 
+ capture_exprs = translate.translate( + outer_ctx, capture_bindings, method=False, allow_expensive_conversions=True + ) + return capture_exprs + + def decl(self) -> str: + return_type = functionalization.returns_type(self.g.view.func) + capture_str = ", ".join( + f"{val.type.name} = {val.expr}" for val in self.captures() + ) + decls = [ + a.decl() + for a in functionalization.outer_arguments(is_reverse=self.is_reverse) + ] + return f"[{capture_str}]({', '.join(decls)}) -> {return_type.cpp_type()}" + + def inner_call(self, *, reapply_views: bool | None = None) -> str: + inner_call_name = functionalization.name( + self.g, + is_reverse=self.is_reverse, + include_namespace=True, + reapply_views=reapply_views, + ) + + arg_ctx = functionalization.outer_arguments(is_reverse=self.is_reverse) + capture_ctx = functionalization.capture_arguments( + self.g.view.func, is_reverse=self.is_reverse + ) + full_ctx = arg_ctx + capture_ctx + + assert self.g.view_copy is not None + call_bindings = functionalization.inner_arguments( + self.g.view_copy.func, is_reverse=self.is_reverse + ) + maybe_index = functionalization.inner_call_index(self.g.view_copy.func) + call_exprs = [ + e.expr for e in translate.translate(full_ctx, call_bindings, method=False) + ] + if not self.is_reverse and maybe_index is not None: + return f'{inner_call_name}({", ".join(call_exprs)})[{maybe_index.name}];' + else: + return f'{inner_call_name}({", ".join(call_exprs)});' + + @staticmethod + def from_func( + g: NativeFunctionsViewGroup, *, is_reverse: bool + ) -> FunctionalizationLambda: + return FunctionalizationLambda(g, is_reverse) + + +@dataclass(frozen=True) +class StructuredImplSignature: + g: NativeFunctionsGroup + name: str + + def defn(self, name: str | None = None) -> str: + args_str = ", ".join(a.defn() for a in self.arguments()) + return f"TORCH_IMPL_FUNC({self.name})({args_str})" + + def arguments(self) -> list[Binding]: + return structured.impl_arguments(self.g) + + +# Helper functions + + +def kernel_signature( + f: NativeFunction, backend_index: BackendIndex, *, prefix: str = "" +) -> NativeSignature | DispatcherSignature: + # Note [External Backends Follow Dispatcher API] + # Kernel signatures for in-tree backends follow the "native" API, + # while kernels for out-of-tree backends follow the dispatcher API. + # See the comments in `native.py` for details, but historically there have been + # some small differences in schema convention between them and the Dispatcher API. + # Any differences that require translating between the two will results in a runtime cost, + # so we'd like to keep the differences as small as possible. + # With external backends, we'd like to enforce that they write their kernels with schemas + # that match the Dispatcher API directly, if they can. 
+ meta = backend_index.get_kernel(f) + symint = meta is not None and meta.supports_symint() + if symint: + assert ( + f.func.has_symint() + ), f"attempted to define symint kernel for {backend_index.dispatch_key} without SymInt in schema" + if backend_index.external: + return DispatcherSignature.from_schema(f.func, prefix=prefix, symint=symint) + else: + return NativeSignature(f.func, prefix=prefix, symint=symint) + + +# Functions only, no types +from torchgen.api import ( + cpp, + dispatcher, + functionalization, + native, + structured, + translate, +) diff --git a/torchgen/api/types/types.py b/torchgen/api/types/types.py new file mode 100644 index 00000000000..30e027a6312 --- /dev/null +++ b/torchgen/api/types/types.py @@ -0,0 +1,191 @@ +""" +Where should I add a new type? `types_base.py` vs `types.py` + +This file defines data model classes for torchgen typing system, as well as some base types such as int32_t. + +`types.py` defines ATen Tensor type and some c10 types, along with signatures that use these types. + +The difference between these two files, is `types_base.py` should be implementation-agnostic, meaning it shouldn't +contain any type definition that is tight to a specific C++ library (e.g., ATen), so that it can be easily reused +if we want to generate code for another C++ library. + +Add new types to `types.py` if these types are ATen/c10 related. +Add new types to `types_base.py` if they are basic and not attached to ATen/c10. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from torchgen.api.types.types_base import ( + BaseCppType, + BaseCType, + boolT, + byteT, + charT, + CType, + doubleT, + floatT, + int32T, + longT, + shortT, +) +from torchgen.model import BaseTy, ScalarType + + +TENSOR_LIST_LIKE_CTYPES = [ + "at::TensorList", + "const c10::List<::std::optional> &", + "const at::ITensorListRef &", +] + + +halfT = BaseCppType("at", "Half") +complexHalfT = BaseCppType( + "c10", "complex" +) # stuffing template param here is an abuse +complexFloatT = BaseCppType("c10", "complex") +complexDoubleT = BaseCppType("c10", "complex") +bfloat16T = BaseCppType("at", "BFloat16") +float8_e5m2T = BaseCppType("at", "Float8_e5m2") +float8_e5m2fnuzT = BaseCppType("at", "Float8_e5m2fnuz") +float8_e4m3fnT = BaseCppType("at", "Float8_e4m3fn") +float8_e4m3fnuzT = BaseCppType("at", "Float8_e4m3fnuz") +stringT = BaseCppType("c10", "string_view") +generatorT = BaseCppType("at", "Generator") +scalarTypeT = BaseCppType("at", "ScalarType") +tensorT = BaseCppType("at", "Tensor") +optionalTensorRefT = BaseCppType("at", "OptionalTensorRef") +tensorListT = BaseCppType("at", "TensorList") +iTensorListRefT = BaseCppType("at", "ITensorListRef") +iOptTensorListRefT = BaseCppType("at", "IOptTensorListRef") +dimnameT = BaseCppType("at", "Dimname") +dimnameListT = BaseCppType("at", "DimnameList") +dimVectorT = BaseCppType("at", "DimVector") +layoutT = BaseCppType("at", "Layout") +deviceT = BaseCppType("at", "Device") +deviceIndexT = BaseCppType("at", "DeviceIndex") +scalarT = BaseCppType("at", "Scalar") +optionalScalarRefT = BaseCppType("at", "OptionalScalarRef") +memoryFormatT = BaseCppType("at", "MemoryFormat") +qschemeT = BaseCppType("at", "QScheme") +storageT = BaseCppType("at", "Storage") +streamT = BaseCppType("at", "Stream") +intArrayRefT = BaseCppType("at", "IntArrayRef") +optionalIntArrayRefT = BaseCppType("at", "OptionalIntArrayRef") +optionalSymIntArrayRefT = BaseCppType("at", "OptionalSymIntArrayRef") +tensorOptionsT = BaseCppType("at", "TensorOptions") 
+typeAndSizeT = BaseCppType("torch::autograd::generated", "TypeAndSize") +tensorGeometryT = BaseCppType("at", "TensorGeometry") +SymIntT = BaseCppType("c10", "SymInt") +symIntArrayRefT = BaseCppType("c10", "SymIntArrayRef") + +# Types representing template parameters. Technically, we probably shouldn't +# represent them this way in codegen, but it was pretty convenient. +scalar_t = BaseCppType("", "scalar_t") +opmath_t = BaseCppType("", "opmath_t") + +ScalarTypeToCppMapping: dict[ScalarType, BaseCppType] = { + ScalarType.Byte: byteT, + ScalarType.Char: charT, + ScalarType.Short: shortT, + ScalarType.Int: int32T, + ScalarType.Long: longT, + ScalarType.Half: halfT, + ScalarType.Float: floatT, + ScalarType.Double: doubleT, + ScalarType.ComplexHalf: complexHalfT, + ScalarType.ComplexFloat: complexFloatT, + ScalarType.ComplexDouble: complexDoubleT, + ScalarType.Bool: boolT, + ScalarType.Float8_e5m2: float8_e5m2T, + ScalarType.Float8_e5m2fnuz: float8_e5m2fnuzT, + ScalarType.Float8_e4m3fn: float8_e4m3fnT, + ScalarType.Float8_e4m3fnuz: float8_e4m3fnuzT, +} + +BaseTypeToCppMapping: dict[BaseTy, BaseCppType] = { + BaseTy.int: longT, + BaseTy.float: doubleT, + BaseTy.bool: boolT, + BaseTy.str: stringT, + BaseTy.Generator: generatorT, + BaseTy.ScalarType: scalarTypeT, + BaseTy.Tensor: tensorT, + BaseTy.Dimname: dimnameT, + BaseTy.DimVector: dimVectorT, + BaseTy.Layout: layoutT, + BaseTy.Device: deviceT, + BaseTy.DeviceIndex: deviceIndexT, + BaseTy.Scalar: scalarT, + BaseTy.MemoryFormat: memoryFormatT, + BaseTy.QScheme: qschemeT, + BaseTy.Storage: storageT, + BaseTy.Stream: streamT, + BaseTy.SymInt: SymIntT, +} + +# CTypes encode C++ type structure as needed for translation. + + +@dataclass(frozen=True) +class OptionalCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"::std::optional<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"::std::optional<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return OptionalCType(self.elem.remove_const_ref()) + + +@dataclass(frozen=True) +class ListCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"c10::List<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"c10::List<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return ListCType(self.elem.remove_const_ref()) + + +@dataclass(frozen=True) +class ArrayRefCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"at::ArrayRef<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"ArrayRef<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return ArrayRefCType(self.elem.remove_const_ref()) + + +@dataclass(frozen=True) +class VectorizedCType(CType): + # This template is explicitly specialized, so the only valid + # elems are those we have specializations for (e.g., float, double, ...) 
+ # scalar_t is also a common argument here (when we are codegen in + # a templated context) + elem: BaseCType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + return f"at::vec::Vectorized<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + raise NotImplementedError + + def remove_const_ref(self) -> CType: + return self diff --git a/torchgen/api/types/types_base.py b/torchgen/api/types/types_base.py new file mode 100644 index 00000000000..e031b79485e --- /dev/null +++ b/torchgen/api/types/types_base.py @@ -0,0 +1,276 @@ +""" +Where should I add a new type? `types_base.py` vs `types.py` + +This file defines data model classes for torchgen typing system, as well as some base types such as int32_t. + +`types.py` defines ATen Tensor type and some c10 types, along with signatures that use these types. + +The difference between these two files, is `types_base.py` should be implementation-agnostic, meaning it shouldn't +contain any type definition that is tight to a specific C++ library (e.g., ATen), so that it can be easily reused +if we want to generate code for another C++ library. + +Add new types to `types.py` if these types are ATen/c10 related. +Add new types to `types_base.py` if they are basic and not attached to ATen/c10. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import auto, Enum +from typing import TYPE_CHECKING, Union + + +if TYPE_CHECKING: + from torchgen.model import Argument, SelfArgument, TensorOptionsArguments + + +# An ArgName is just the str name of the argument in schema; +# but in some special circumstances, we may add a little extra +# context. The Enum SpecialArgName covers all of these cases; +# grep for their construction sites to see when they can occur. + + +class SpecialArgName(Enum): + possibly_redundant_memory_format = auto() + + +ArgName = Union[str, SpecialArgName] + + +# This class shouldn't be created directly; instead, use/create one of the singletons below. +@dataclass(frozen=True) +class BaseCppType: + ns: str | None + name: str + + def __str__(self) -> str: + if self.ns is None or self.ns == "": + return self.name + return f"{self.ns}::{self.name}" + + +# The set of all non-templated, valid, fully-qualified names of C++ types that are used in the codegen. +# Templated types get their own dataclass, mainly to make namespace parsing easier. 
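To make the split between the plain BaseCppType singletons defined below and the templated CType dataclasses concrete, here is a minimal usage sketch (assuming torchgen is importable; the strings in the trailing comments follow from the cpp_type implementations in this file):

from torchgen.api.types.types_base import (
    BaseCType,
    ConstRefCType,
    longT,
    VectorCType,
)

# Compose wrappers to describe `const std::vector<int64_t> &`.
nctype = ConstRefCType(VectorCType(BaseCType(longT)))
print(nctype.cpp_type())                     # const ::std::vector<int64_t> &
print(nctype.remove_const_ref().cpp_type())  # ::std::vector<int64_t>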
+byteT = BaseCppType("", "uint8_t") +charT = BaseCppType("", "int8_t") +shortT = BaseCppType("", "int16_t") +# It would be more symmetric for this to be called intT, but it easy to mix +# this up with JIT int (which is int64_t in C++), so we intentionally don't +# define intT to make it obvious when you've stuffed it up +int32T = BaseCppType("", "int32_t") +longT = BaseCppType("", "int64_t") +doubleT = BaseCppType("", "double") +floatT = BaseCppType("", "float") +boolT = BaseCppType("", "bool") +voidT = BaseCppType("", "void") + + +class CType(ABC): + @abstractmethod + def cpp_type(self, *, strip_ref: bool = False) -> str: + raise NotImplementedError + + @abstractmethod + def cpp_type_registration_declarations(self) -> str: + raise NotImplementedError + + @abstractmethod + def remove_const_ref(self) -> CType: + return self + + +@dataclass(frozen=True) +class BaseCType(CType): + type: BaseCppType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + return str(self.type) + + # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml + # TODO: Kill this when we eventually remove it! + def cpp_type_registration_declarations(self) -> str: + return str(self.type).replace("at::", "") + + def remove_const_ref(self) -> CType: + return self + + +@dataclass(frozen=True) +class ConstRefCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) + return f"const {self.elem.cpp_type()} &" + + def cpp_type_registration_declarations(self) -> str: + return f"const {self.elem.cpp_type_registration_declarations()} &" + + def remove_const_ref(self) -> CType: + return self.elem.remove_const_ref() + + +@dataclass(frozen=True) +class VectorCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"::std::vector<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"::std::vector<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return VectorCType(self.elem.remove_const_ref()) + + +@dataclass(frozen=True) +class ArrayCType(CType): + elem: CType + size: int + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"::std::array<{self.elem.cpp_type()},{self.size}>" + + def cpp_type_registration_declarations(self) -> str: + return f"::std::array<{self.elem.cpp_type_registration_declarations()},{self.size}>" + + def remove_const_ref(self) -> CType: + return ArrayCType(self.elem.remove_const_ref(), self.size) + + +@dataclass(frozen=True) +class TupleCType(CType): + elems: list[CType] + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. 
+ return f'::std::tuple<{",".join([e.cpp_type() for e in self.elems])}>' + + def cpp_type_registration_declarations(self) -> str: + return f'::std::tuple<{",".join([e.cpp_type_registration_declarations() for e in self.elems])}>' + + def remove_const_ref(self) -> CType: + return TupleCType([e.remove_const_ref() for e in self.elems]) + + +@dataclass(frozen=True) +class MutRefCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) + return f"{self.elem.cpp_type()} &" + + def cpp_type_registration_declarations(self) -> str: + return f"{self.elem.cpp_type_registration_declarations()} &" + + def remove_const_ref(self) -> CType: + return self.elem.remove_const_ref() + + +# A NamedCType is short for Named C++ semantic type. A NamedCType represents a C++ type, plus +# semantic information about what it represents. For example, consider the +# argument "bool pin_memory"; its normal C++ type is "bool", but its C++ +# semantic type also keeps track that this represents a "pin_memory"; you can't +# just use a random other boolean in a context where you need a "pin_memory"! +# + + +@dataclass(frozen=True) +class NamedCType: + name: ArgName + type: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + return self.type.cpp_type(strip_ref=strip_ref) + + # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml + # TODO: Kill this when we eventually remove it! + def cpp_type_registration_declarations(self) -> str: + return self.type.cpp_type_registration_declarations() + + def remove_const_ref(self) -> NamedCType: + return NamedCType(self.name, self.type.remove_const_ref()) + + def with_name(self, name: str) -> NamedCType: + return NamedCType(name, self.type) + + +# A binding represents any C++ binding site for a formal parameter. +# We don't distinguish between binding sites for different APIs; +# instead, all of the important distinctions are encoded in CType, +# which you can use to figure out if a given Binding is appropriate +# for use in another context. (See torchgen.api.translate) + + +@dataclass(frozen=True) +class Binding: + name: str + nctype: NamedCType + argument: Argument | TensorOptionsArguments | SelfArgument + # TODO: maybe don't represent default here + default: str | None = None + + def rename(self, name: str) -> Binding: + return Binding( + name=name, + nctype=self.nctype, + argument=self.argument, + default=self.default, + ) + + @property + def type(self) -> str: + return self.nctype.cpp_type() + + def no_default(self) -> Binding: + return Binding( + name=self.name, + nctype=self.nctype, + default=None, + argument=self.argument, + ) + + def decl(self, *, func_ptr_cast: bool = False) -> str: + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + + # casting only needs to know the type + if func_ptr_cast: + return f"{self.type}" + else: + return f"{self.type} {self.name}{mb_default}" + + # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml + # TODO: Kill this when we eventually remove it! 
+ def decl_registration_declarations(self) -> str: + type_s = self.nctype.cpp_type_registration_declarations() + mb_default = "" + if self.default is not None: + mb_default = f"={self.default}" + return f"{type_s} {self.name}{mb_default}" + + def defn(self) -> str: + return f"{self.type} {self.name}" + + def with_name(self, name: str) -> Binding: + return Binding( + name=name, nctype=self.nctype, argument=self.argument, default=self.default + ) + + +# An Expr is a C++ expression. It has a C++ string representing its syntax, +# as well as a CType saying what it provides. + + +@dataclass(frozen=True) +class Expr: + expr: str + type: NamedCType diff --git a/torchgen/api/ufunc.py b/torchgen/api/ufunc.py new file mode 100644 index 00000000000..17adcccecab --- /dev/null +++ b/torchgen/api/ufunc.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +from dataclasses import dataclass + +import torchgen.api.types as api_types +from torchgen.api import cpp, structured +from torchgen.api.types import ( + ArgName, + BaseCppType, + BaseCType, + Binding, + ConstRefCType, + CType, + NamedCType, + scalarT, +) +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + DispatchKey, + FunctionSchema, + NativeFunctionsGroup, + Type, +) + + +def schema_kernel_name(func: FunctionSchema, dispatch_key: DispatchKey) -> str: + assert func.is_out_fn(), "ufunc.kernel_name should only be invoked on out schemas" + return f"ufunc_{func.name.name}_{dispatch_key}" + + +def kernel_name(g: NativeFunctionsGroup, dispatch_key: DispatchKey) -> str: + return schema_kernel_name(g.out.func, dispatch_key) + + +# Tensors are omitted (as they are stored in TensorIterator), everything else is +# passed along (technically, we can pass tensors along too, it just wastes +# argument registers) +# +# NB: used for CPU only +def dispatchstub_type(t: Type, *, binds: ArgName) -> NamedCType | None: + # Dispatch stubs are always plain ints + r = cpp.valuetype_type(t, binds=binds, symint=False) + if r is not None: + return r + + if t == BaseType(BaseTy.Scalar): + return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) + elif t == BaseType(BaseTy.Tensor): + return None + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +def opmath_type(scalar_t: BaseCppType) -> BaseCppType: + if scalar_t == api_types.scalar_t: + return api_types.opmath_t + raise NotImplementedError + + +# NB: Tensors in constructor are stored in opmath_t, not scalar_t +# because Tensor in constructor = its a scalar tensor partially applied = +# it can be higher precision and we want to compute in that higher precision +# +# NB: CUDA only +def ufunctor_ctor_type(t: Type, *, binds: ArgName, scalar_t: BaseCppType) -> NamedCType: + r = cpp.valuetype_type(t, binds=binds, symint=False) + if r is not None: + return r + + if t == BaseType(BaseTy.Scalar): + return NamedCType(binds, BaseCType(opmath_type(scalar_t))) + elif t == BaseType(BaseTy.Tensor): + return NamedCType(binds, BaseCType(opmath_type(scalar_t))) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# Only Tensors ever get passed directly to operator() +# +# NB: CUDA only +# (Actually, this works for CPU too) +def ufunctor_apply_type( + t: Type, *, binds: ArgName, scalar_t: BaseCppType +) -> NamedCType: + if t == BaseType(BaseTy.Tensor): + return NamedCType(binds, BaseCType(scalar_t)) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# The actual ufunc template function the user writes. Everything here +# is done in the computation type. 
compute_t is opmath_t in CUDA and scalar_t +# in CPU +def ufunc_type(t: Type, *, binds: ArgName, compute_t: CType) -> NamedCType: + r = cpp.valuetype_type(t, binds=binds, symint=False) + if r is not None: + return r + + if t == BaseType(BaseTy.Scalar): + return NamedCType(binds, compute_t) + elif t == BaseType(BaseTy.Tensor): + return NamedCType(binds, compute_t) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +def ufunctor_ctor_argument(a: Argument, scalar_t: BaseCppType) -> Binding: + return Binding( + nctype=ufunctor_ctor_type(a.type, binds=a.name, scalar_t=scalar_t), + name=a.name, + default=None, + argument=a, + ) + + +def ufunctor_apply_argument(a: Argument, scalar_t: BaseCppType) -> Binding: + return Binding( + nctype=ufunctor_apply_type(a.type, binds=a.name, scalar_t=scalar_t), + name=a.name, + default=None, + argument=a, + ) + + +def ufunc_argument(a: Argument, compute_t: CType) -> Binding: + return Binding( + nctype=ufunc_type(a.type, binds=a.name, compute_t=compute_t), + name=a.name, + default=None, + argument=a, + ) + + +@dataclass(frozen=True) +class UfunctorBindings: + ctor: list[Binding] + apply: list[Binding] + + +# ufunctors are a CUDA-only concept representing functors that take some of +# their arguments on a host-side constructor, and the rest in the device-side +# apply. E.g., +# +# template +# struct CUDAFunctorOnSelf_add { +# using opmath_t = at::opmath_type; +# opmath_t other_; +# opmath_t alpha_; +# CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha) : other_(other), alpha_(alpha) {} +# __device__ scalar_t operator()(scalar_t self) { +# return ufunc::add(static_cast(self), other_, alpha_); +# } +# }; +# +# The ctor refers to the constructor CUDAFunctorOnSelf_add, while apply refers +# to the operator() definition +def ufunctor_arguments( + g: NativeFunctionsGroup, *, scalar_tensor_idx: int | None, scalar_t: BaseCppType +) -> UfunctorBindings: + ctor = [] + apply = [] + for a in g.functional.func.arguments.flat_non_out: + if a.type.is_tensor_like(): + if scalar_tensor_idx == 0: + # put it in the ctor anyway + ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t)) + scalar_tensor_idx = None + else: + if scalar_tensor_idx is not None: + scalar_tensor_idx -= 1 + apply.append(ufunctor_apply_argument(a, scalar_t=scalar_t)) + else: + ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t)) + assert scalar_tensor_idx is None + return UfunctorBindings(ctor=ctor, apply=apply) + + +# ufuncs are the inner loop template functions that you wrote in ufunc/add.h +# which do the actual computation in question. E.g., +# +# template +# C10_HOST_DEVICE T add(T self, T other, T alpha) __ubsan_ignore_undefined__ { +# return self + alpha * other; +# } +# +# In this file, we refer to T as compute_t which is bound by caller +def ufunc_arguments(g: NativeFunctionsGroup, *, compute_t: CType) -> list[Binding]: + return [ + ufunc_argument(a, compute_t=compute_t) + for a in g.functional.func.arguments.flat_non_out + ] + + +# Stubs are the DispatchStub trampolines that CPU kernels use to get to their +# vectorized versions. 
E.g., +# +# using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha); +# DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub); +def stub_arguments(g: NativeFunctionsGroup) -> list[Binding]: + # stubs drop all tensor arguments (they are implicit in the TensorIterator + # argument and keep everything else) + return [ + r + for a in g.out.func.arguments.flat_non_out + if not a.type.is_tensor_like() + for r in structured.argument(a) + ] diff --git a/torchgen/api/unboxing.py b/torchgen/api/unboxing.py new file mode 100644 index 00000000000..1e649b75178 --- /dev/null +++ b/torchgen/api/unboxing.py @@ -0,0 +1,249 @@ +from __future__ import annotations + +from torchgen.api import cpp +from torchgen.api.types import Binding, CppSignatureGroup, CType +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + ListType, + NativeFunction, + OptionalType, + Type, +) + + +# This file generates the code for unboxing wrappers, i.e., the glue logic to unbox a boxed operator and convert the +# ivalues from stack to correct arguments to the unboxed kernel, based on corresponding JIT schema. This codegen is +# an alternative way to generate unboxing wrappers similar to the existing C++ metaprogramming approach but gets the +# job done statically. These generated unboxing wrappers will be useful under the scenario where we need to register +# a fixed set of operators known at compile time and thus can save some time in runtime initialization phase. +# +# Here's an example on how the codegen works: +# +# - Function Schema (source of truth) +# +# aten::empty.names(int[] size, *, Dimname[]? names, +# ScalarType? dtype=None, Layout? layout=None, +# Device? device=None, bool? pin_memory=None, +# MemoryFormat? memory_format=None) -> Tensor +# - Argument Conversion +# Generates C++ code to convert an ivalue (from stack) to its underlying C++ type. +# - int[] size +# ```cpp +# const c10::List size_list_in = (std::move(peek(stack, 0, 7))).toList(); +# +# std::vector size_vec; +# for (c10::IValue size_elem: size_list_in) { +# int64_t size_base = size_elem.to(); +# size_vec.push_back(size_base); +# } +# at::ArrayRef size_list_out(size_vec); +# ~~~~~~~~~~~~~ <-- The converted argument from ivalues in the stack. +# Will be passed to unboxed kernel. +# ``` +# - Dimname[]? names +# ```cpp +# ::std::optional names_opt = (std::move(peek(stack, 1, 7))).toOptional(); +# ::std::optional> names_opt_out; +# if (names_opt.has_value()) { +# ~~~~~~~~~~~ <-- Unwrapping optional shell +# const c10::IValue names_opt_in = names_opt.value(); +# const c10::List names_list_in = names_opt_in.toList(); +# +# std::vector names_vec; +# for (c10::IValue names_elem: names_list_in) { +# ~~~~~~~~~~~~~~~~~~~~~~~~~ <-- Unrolling list, then convert elements one by one. +# at::Dimname names_base = names_elem.to(); +# names_vec.push_back(names_base); +# } +# at::ArrayRef names_list_out(names_vec); +# +# names_opt_out = ::std::optional>(names_list_out); +# } else { +# names_opt_out = ::std::optional>(); +# } +# ``` +# - ScalarType? dtype (similarly for the rest of the arguments) +# ```cpp +# ::std::optional dtype_opt = (std::move(peek(stack, 2, 7))).toOptional(); +# ::std::optional dtype_opt_out; +# if (dtype_opt.has_value()) { +# const c10::IValue dtype_opt_in = dtype_opt.value(); +# at::ScalarType dtype_base = dtype_opt_in.to(); +# ~~~~~~~~~~~~~~~~~~~~ <-- For base types, convert ivalue to it +# directly using ".to()" API. 
+# dtype_opt_out = ::std::optional(dtype_base); +# } else { +# dtype_opt_out = ::std::optional(); +# } +# ``` +# +# - Unboxed Kernel Call +# ```cpp +# auto result_ = torch::empty( +# size_list_out, +# names_opt_out, +# options, +# memory_format_opt_out +# ); +# ``` +# +# - Push Result Back to Stack +# ```cpp +# drop(stack, 7); +# pack(stack, std::move(result_)); +# ``` +connector = "\n\t" + + +# Return unboxing function name for a NativeFunction +def name(f: NativeFunction) -> str: + return f.func.name.unambiguous_name() + + +# Convert all the arguments in a NativeFunction to C++ code +def convert_arguments(f: NativeFunction) -> tuple[list[Binding], list[str]]: + # we need the 'self' argument so method needs to be False + args = ( + CppSignatureGroup.from_native_function(f, method=False) + .most_faithful_signature() + .arguments() + ) + code_list = [ + f"c10::IValue {args[i].name} = std::move(peek(stack, {i}, {len(args)}));" + for i in range(len(args)) + ] + [""] + binding_list = [] + for arg in args: + # expecting only Argument + if not isinstance(arg.argument, Argument): + raise Exception( # noqa: TRY002 + f"Unexpected argument type, expecting `Argument` but got {arg}" + ) + argument: Argument = arg.argument + unboxed_name, _, code, decl = argumenttype_ivalue_convert( + argument.type, + argument.name, + mutable=argument.is_write, + ) + code_list.extend(decl) + code_list.extend(code) + binding_list.append(arg.with_name(unboxed_name)) + return binding_list, code_list + + +# Takes in the type, name and mutability corresponding to an argument, and generates a tuple of: +# (1) the C++ code necessary to unbox the argument +# (2) A Binding corresponding to the newly created unboxed variable, including variable name and its CType +def argumenttype_ivalue_convert( + t: Type, arg_name: str, *, mutable: bool = False +) -> tuple[str, CType, list[str], list[str]]: + # Unboxing is for mobile, which doesn't care about SymInts + ctype = cpp.argumenttype_type( + t=t, mutable=mutable, binds=arg_name, symint=False + ).type + + if isinstance(t, BaseType): + out_name = f"{arg_name}_base" + code, decl = _gen_code_base_type( + arg_name=arg_name, out_name=out_name, ctype=ctype + ) + elif isinstance(t, OptionalType): + out_name = f"{arg_name}_opt_out" + code, decl = _gen_code_optional_type( + arg_name=arg_name, + out_name=out_name, + t=t, + ctype=ctype, + ) + elif isinstance(t, ListType): + out_name = f"{arg_name}_list_out" + code, decl = _gen_code_list_type( + arg_name=arg_name, + out_name=out_name, + t=t, + ctype=ctype, + ) + else: + raise Exception(f"Cannot handle type {t}. 
arg_name: {arg_name}") # noqa: TRY002 + return out_name, ctype, code, decl + + +def _gen_code_base_type( + arg_name: str, out_name: str, ctype: CType +) -> tuple[list[str], list[str]]: + return [ + f"{ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();" + ], [] + + +def _gen_code_optional_type( + arg_name: str, out_name: str, t: OptionalType, ctype: CType +) -> tuple[list[str], list[str]]: + in_name = f"{arg_name}_opt_in" + res_name, _, res_code, decl = argumenttype_ivalue_convert(t.elem, in_name) + return ( + f""" +auto {arg_name}_opt = {arg_name}.toOptional(); +{ctype.cpp_type(strip_ref=True)} {out_name}; +if ({arg_name}_opt.has_value()) {{ + const c10::IValue {in_name} = {arg_name}_opt.value(); + {connector.join(res_code)} + {out_name} = {ctype.cpp_type(strip_ref=True)}({res_name}); +}} else {{ + {out_name} = {ctype.cpp_type(strip_ref=True)}(); +}} + """.split( + "\n" + ), + decl, + ) + + +def _gen_code_list_type( + arg_name: str, out_name: str, t: ListType, ctype: CType +) -> tuple[list[str], list[str]]: + in_name = f"{arg_name}_list_in" + elem_name = f"{arg_name}_elem" + code = [f"const c10::List {in_name} = {arg_name}.toList();"] + res_name, res_ctype, res_code, decl = argumenttype_ivalue_convert(t.elem, elem_name) + # handle list type with size, e.g., bool[4] + if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool and t.size: + code.extend( + f""" +{ctype.cpp_type(strip_ref=True)} {out_name} = as_array<{res_ctype.cpp_type(strip_ref=True)}, {t.size}>({in_name}); + """.split( + "\n" + ) + ) + # we have to use c10::List for optional element. e.g., Tensor?[] -> c10::List<::std::optional> + elif isinstance(t.elem, OptionalType): + code.extend( + f""" +{ctype.cpp_type(strip_ref=True)} {out_name}; +for (c10::IValue {elem_name}: {in_name}) {{ + {connector.join(res_code)} + {out_name}.push_back({res_name}); +}} + """.split( + "\n" + ) + ) + else: + # use ArrayRef as default. 
+ vec_name = arg_name + "_vec" + # need to bring vector instantiation out of scope so that ArrayRef has valid data + decl.append(f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};") + code.extend( + f""" +for (c10::IValue {elem_name}: {in_name}) {{ + {connector.join(res_code)} + {vec_name}.push_back({res_name}); +}} +{ctype.cpp_type(strip_ref=True)} {out_name}({vec_name}); + """.split( + "\n" + ) + ) + return code, decl diff --git a/torchgen/build.bzl b/torchgen/build.bzl new file mode 100644 index 00000000000..2ec68955df9 --- /dev/null +++ b/torchgen/build.bzl @@ -0,0 +1,30 @@ +def define_targets(rules): + rules.py_library( + name = "torchgen", + srcs = rules.glob(["**/*.py"]), + visibility = ["//visibility:public"], + deps = [ + rules.requirement("PyYAML"), + rules.requirement("typing-extensions"), + ], + ) + + rules.py_binary( + name = "gen", + srcs = [":torchgen"], + visibility = ["//visibility:public"], + deps = [ + rules.requirement("PyYAML"), + rules.requirement("typing-extensions"), + ], + ) + + rules.py_binary( + name = "gen_executorch", + srcs = [":torchgen"], + visibility = ["//visibility:public"], + deps = [ + rules.requirement("PyYAML"), + rules.requirement("typing-extensions"), + ], + ) diff --git a/torchgen/code_template.py b/torchgen/code_template.py new file mode 100644 index 00000000000..cdb86a48064 --- /dev/null +++ b/torchgen/code_template.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import re +from typing import Mapping, Sequence + + +# match $identifier or ${identifier} and replace with value in env +# If this identifier is at the beginning of whitespace on a line +# and its value is a list then it is treated as +# block substitution by indenting to that depth and putting each element +# of the list on its own line +# if the identifier is on a line starting with non-whitespace and a list +# then it is comma separated ${,foo} will insert a comma before the list +# if this list is not empty and ${foo,} will insert one after. 
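A minimal usage sketch of the substitution rules described above (assuming this file is importable as torchgen.code_template; the template string and environment are illustrative, and the expected output is shown in the trailing comment):

from torchgen.code_template import CodeTemplate

t = CodeTemplate("void f(int a${,extra}) {\n    $body\n}")
print(t.substitute(extra=["int b", "int c"], body=["x();", "y();"]))
# void f(int a, int b, int c) {
#     x();
#     y();
# }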
+ + +class CodeTemplate: + substitution_str = r"(^[^\n\S]*)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})" + substitution = re.compile(substitution_str, re.MULTILINE) + + pattern: str + filename: str + + @staticmethod + def from_file(filename: str) -> CodeTemplate: + with open(filename) as f: + return CodeTemplate(f.read(), filename) + + def __init__(self, pattern: str, filename: str = "") -> None: + self.pattern = pattern + self.filename = filename + + def substitute( + self, env: Mapping[str, object] | None = None, **kwargs: object + ) -> str: + if env is None: + env = {} + + def lookup(v: str) -> object: + assert env is not None + return kwargs[v] if v in kwargs else env[v] + + def indent_lines(indent: str, v: Sequence[object]) -> str: + return "".join( + [indent + l + "\n" for e in v for l in str(e).splitlines()] + ).rstrip() + + def replace(match: re.Match[str]) -> str: + indent = match.group(1) + key = match.group(2) + comma_before = "" + comma_after = "" + if key[0] == "{": + key = key[1:-1] + if key[0] == ",": + comma_before = ", " + key = key[1:] + if key[-1] == ",": + comma_after = ", " + key = key[:-1] + v = lookup(key) + if indent is not None: + if not isinstance(v, list): + v = [v] + return indent_lines(indent, v) + elif isinstance(v, list): + middle = ", ".join([str(x) for x in v]) + if len(v) == 0: + return middle + return comma_before + middle + comma_after + else: + return str(v) + + return self.substitution.sub(replace, self.pattern) + + +if __name__ == "__main__": + c = CodeTemplate( + """\ + int foo($args) { + + $bar + $bar + $a+$b + } + int commatest(int a${,stuff}) + int notest(int a${,empty,}) + """ + ) + print( + c.substitute( + args=["hi", 8], + bar=["what", 7], + a=3, + b=4, + stuff=["things...", "others"], + empty=[], + ) + ) diff --git a/torchgen/context.py b/torchgen/context.py new file mode 100644 index 00000000000..a2031049816 --- /dev/null +++ b/torchgen/context.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import contextlib +import functools +from typing import Any, Callable, Iterator, List, Optional, Tuple, TypeVar, Union + +import torchgen.local as local +from torchgen.model import ( + BackendIndex, + DispatchKey, + NativeFunction, + NativeFunctionsGroup, + NativeFunctionsViewGroup, +) +from torchgen.utils import context, S, T + + +# Helper functions for defining generators on things in the model + +F = TypeVar( + "F", + NativeFunction, + NativeFunctionsGroup, + NativeFunctionsViewGroup, + Union[NativeFunction, NativeFunctionsGroup], + Union[NativeFunction, NativeFunctionsViewGroup], +) + +F2 = TypeVar( + "F2", + NativeFunction, + NativeFunctionsGroup, + Optional[NativeFunction], + bool, + str, +) + +F3 = TypeVar("F3", Tuple[NativeFunction, Any], List[NativeFunction]) + + +@contextlib.contextmanager +def native_function_manager( + g: NativeFunctionsGroup | NativeFunctionsViewGroup | NativeFunction, +) -> Iterator[None]: + if isinstance(g, NativeFunctionsGroup): + # By default, we associate all errors with structured native functions + # with the out variant. 
In some cases, it might be better to have + # a more specific place to hang things; if so, use + # native_function_manager again on the inside + f = g.out + elif isinstance(g, NativeFunctionsViewGroup): + # We associate errors with the view operator + f = g.view + else: + f = g + with context(lambda: f"in native_functions.yaml line {f.loc}:\n {f.func}"): + with local.parametrize( + use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors, + use_ilistref_for_tensor_lists=f.part_of_structured_group, + ): + yield + + +# Given a function that operates on NativeFunction, wrap it into a new function +# that sets some appropriate context managers for that native function. +# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound +# (you will get an error if we try to access the local variables without having +# set them). +def with_native_function(func: Callable[[F], T]) -> Callable[[F], T]: + @functools.wraps(func) + def wrapper(f: F) -> T: + with native_function_manager(f): + return func(f) + + return wrapper + + +def with_native_function_and(func: Callable[[F, F2], T]) -> Callable[[F, F2], T]: + @functools.wraps(func) + def wrapper(f: F, f2: F2) -> T: + # The first native_function is assumed to be the one with the appropriate context. + with native_function_manager(f): + return func(f, f2) + + return wrapper + + +def method_with_native_function(func: Callable[[S, F], T]) -> Callable[[S, F], T]: + @functools.wraps(func) + def wrapper(slf: S, f: F) -> T: + with native_function_manager(f): + return func(slf, f) + + return wrapper + + +def method_with_nested_native_function( + func: Callable[[S, F3], T] +) -> Callable[[S, F3], T]: + @functools.wraps(func) + def wrapper(slf: S, f: F3) -> T: + with native_function_manager(f[0]): + return func(slf, f) + + return wrapper + + +# Convenience decorator for functions that explicitly take in a BackendIndex, +# instead of indirectly taking one in as a closure +def with_native_function_and_index( + func: Callable[[F, BackendIndex], T] +) -> Callable[[F, BackendIndex], T]: + @functools.wraps(func) + def wrapper(f: F, backend_index: BackendIndex) -> T: + with native_function_manager(f): + return func(f, backend_index) + + return wrapper + + +# Convenience decorator for functions that explicitly take in a Dict of BackendIndices +def with_native_function_and_indices( + func: Callable[[F, dict[DispatchKey, BackendIndex]], T] +) -> Callable[[F, dict[DispatchKey, BackendIndex]], T]: + @functools.wraps(func) + def wrapper(f: F, backend_indices: dict[DispatchKey, BackendIndex]) -> T: + with native_function_manager(f): + return func(f, backend_indices) + + return wrapper diff --git a/torchgen/decompositions/gen_jit_decompositions.py b/torchgen/decompositions/gen_jit_decompositions.py new file mode 100644 index 00000000000..b42948045cb --- /dev/null +++ b/torchgen/decompositions/gen_jit_decompositions.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +import os +from pathlib import Path + +from torch.jit._decompositions import decomposition_table + + +# from torchgen.code_template import CodeTemplate + +DECOMP_HEADER = r""" +/** + * @generated + * This is an auto-generated file. Please do not modify it by hand. 
+ * To re-generate, please run: + * cd ~/pytorch && python torchgen/decompositions/gen_jit_decompositions.py + */ +#include +#include +#include +#include + +namespace torch { +namespace jit { + + +const std::string decomp_funcs = +R"(""" + + +DECOMP_CENTER = r""" +)"; + +const std::string& GetSerializedDecompositions() { + return decomp_funcs; +} + +const OperatorMap& GetDecompositionMapping() { + // clang-format off + static const OperatorMap decomposition_mapping { +""" + +DECOMP_END = r""" + }; + // clang-format on + + return decomposition_mapping; +} + +} // namespace jit +} // namespace torch +""" + + +DECOMPOSITION_UTIL_FILE_NAME = "decomposition_registry_util.cpp" + + +def gen_serialized_decompisitions() -> str: + return "\n".join( + [scripted_func.code for scripted_func in decomposition_table.values()] # type: ignore[misc] + ) + + +def gen_decomposition_mappings() -> str: + decomposition_mappings = [] + for schema, scripted_func in decomposition_table.items(): + decomposition_mappings.append( + ' {"' + schema + '", "' + scripted_func.name + '"},' # type: ignore[operator] + ) + return "\n".join(decomposition_mappings) + + +def write_decomposition_util_file(path: str) -> None: + decomposition_str = gen_serialized_decompisitions() + decomposition_mappings = gen_decomposition_mappings() + file_components = [ + DECOMP_HEADER, + decomposition_str, + DECOMP_CENTER, + decomposition_mappings, + DECOMP_END, + ] + print("writing file to : ", path + "/" + DECOMPOSITION_UTIL_FILE_NAME) + with open(os.path.join(path, DECOMPOSITION_UTIL_FILE_NAME), "wb") as out_file: + final_output = "".join(file_components) + out_file.write(final_output.encode("utf-8")) + + +def main() -> None: + pytorch_dir = Path(__file__).resolve().parents[3] + upgrader_path = pytorch_dir / "torch" / "csrc" / "jit" / "runtime" + write_decomposition_util_file(str(upgrader_path)) + + +if __name__ == "__main__": + main() diff --git a/torchgen/dest/__init__.py b/torchgen/dest/__init__.py new file mode 100644 index 00000000000..8f08a743ae2 --- /dev/null +++ b/torchgen/dest/__init__.py @@ -0,0 +1,19 @@ +from torchgen.dest.lazy_ir import ( + generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes, + GenLazyIR as GenLazyIR, + GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition, + GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition, +) +from torchgen.dest.native_functions import ( + compute_native_function_declaration as compute_native_function_declaration, +) +from torchgen.dest.register_dispatch_key import ( + gen_registration_headers as gen_registration_headers, + gen_registration_helpers as gen_registration_helpers, + RegisterDispatchKey as RegisterDispatchKey, +) +from torchgen.dest.ufunc import ( + compute_ufunc_cpu as compute_ufunc_cpu, + compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel, + compute_ufunc_cuda as compute_ufunc_cuda, +) diff --git a/torchgen/dest/lazy_ir.py b/torchgen/dest/lazy_ir.py new file mode 100644 index 00000000000..976c823a165 --- /dev/null +++ b/torchgen/dest/lazy_ir.py @@ -0,0 +1,707 @@ +from __future__ import annotations + +import itertools +from abc import ABC +from dataclasses import dataclass +from typing import Any + +import torchgen.api.dispatcher as dispatcher +from torchgen.api.lazy import ( + getValueT, + isValueType, + LazyArgument, + LazyIrProperties, + LazyIrSchema, + tensorListValueT, +) +from torchgen.api.translate import translate +from torchgen.api.types import ( + BaseCType, + Binding, + deviceT, + DispatcherSignature, + kernel_signature, + 
NativeSignature, + OptionalCType, + VectorCType, +) +from torchgen.context import method_with_native_function +from torchgen.dest.lazy_ts_lowering import ts_lowering_body +from torchgen.model import ( + Argument, + BackendIndex, + BackendMetadata, + BaseTy, + BaseType, + FunctionSchema, + ListType, + NativeFunction, + NativeFunctionsGroup, +) + + +def node_ctor_arg_rvalue_string(arg: LazyArgument) -> str: + """ + Given a LazyArgument, + generate a c++ string for materializing an rvalue of that arg for passing into + a lazy Node constructor. + """ + + # TODO: Matching on CType seems wrong; should be matching on Type + if isValueType(arg.lazy_type): + if isinstance(arg.lazy_type, BaseCType): + if arg.is_wrapped_scalar: + return f"node_{arg.name}" + elif arg.lazy_type.type is tensorListValueT: + return f"lazy_{arg.name}_tensorlist" + elif arg.is_symint_or_list: + return f"GetSymIntValue({arg.name})" + return f"lazy_{arg.name}->GetIrValue()" + elif isinstance(arg.lazy_type, OptionalCType): + if arg.is_symint_or_list: + # TODO: I don't understand when you should put lazy_ in the name + # or not + return f"{arg.name} ? std::make_optional(GetSymIntValue(*{arg.name})) : ::std::nullopt" + elif arg.is_wrapped_scalar: + return f"node_{arg.name}" + return ( + f"lazy_{arg.name} ? " + f"std::make_optional(lazy_{arg.name}->GetIrValue()) : " + "::std::nullopt" + ) + else: + raise AssertionError( + f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})" + ) + else: + # NB: this is here because right now we aren't treating SymInt[] as a + # value type; when we do this needs to move above + # NB: we cannot test arg.lazy_type as we've already specified it is an + # int64_t and so we cannot distinguish between SymInt and int64_t + if isinstance(arg.orig_type, ListType) and arg.orig_type.elem == BaseType( + BaseTy.SymInt + ): + if arg.symint: + return f"GetSymIntArrayRefValue({arg.name})" + else: + return f"std::vector({arg.name}.begin(), {arg.name}.end())" + elif isinstance(arg.lazy_type, VectorCType) and isinstance( + arg.lazy_type.elem, BaseCType + ): + return f"std::vector<{arg.lazy_type.elem.type}>({arg.name}.begin(), {arg.name}.end())" + elif ( + isinstance(arg.lazy_type, OptionalCType) + and isinstance(arg.lazy_type.elem, VectorCType) + and isinstance(arg.lazy_type.elem.elem, BaseCType) + ): + return f"torch::lazy::ToOptionalVector<{arg.lazy_type.elem.elem.type}>({arg.name})" + else: + return f"{arg.name}" + + +def node_ctor_inputs(schema: LazyIrSchema) -> str: + """ + Produce a formatted string with the arguments as passed into the constructor of a node class. 
+ """ + node_ctor_values = [ + node_ctor_arg_rvalue_string(arg) for arg in schema.filtered_args() + ] + return ", ".join(node_ctor_values) + + +def gen_fallback_code( + schema: LazyIrSchema, + sig: DispatcherSignature | NativeSignature, + overload_name: str, +) -> str: + """ + Generate code that falls back to eager conditioned on a predicate + """ + dispatcher_sig = DispatcherSignature.from_schema(schema.func) + exprs = translate(sig.arguments(), dispatcher_sig.arguments()) + fallback_args = ",\n ".join([a.expr for a in exprs]) + if len(overload_name): + aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" + else: + aten_op_str = f"ATEN_OP({schema.aten_name})" + return f""" + if (force_eager_fallback({aten_symbol(schema)})) {{ + return at::native::call_fallback_fn_symint<<c_eager_fallback, {aten_op_str}>::call( + {fallback_args} + ); + }} +""" + + +def aten_symbol(schema: LazyIrSchema) -> str: + missing_interned_strings = { + "sigmoid_backward", + } + if schema.aten_name in missing_interned_strings: + return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' + + if not schema.aten_name.startswith("at::"): + return f"at::aten::{schema.aten_name}" + else: + return schema.aten_name + + +# converts all tensor-like arguments to meta tensors. Returns: +# (1) a string containing all of the logic that does the conversions. +# (2) a context, to be used by translate(), with all of the relevant bindings. +def convert_to_meta_tensors(sig: DispatcherSignature) -> tuple[str, list[Binding]]: + context: list[Binding] = [] + unwrapped_tensor_args: list[str] = [] + for arg in sig.arguments(): + if isinstance(arg.argument, Argument) and arg.argument.type.is_tensor_like(): + unwrapped_name = f"{arg.name}_meta" + unwrapped_tensor_args.append( + f"auto {unwrapped_name} = to_meta({arg.name});" + ) + context.append(arg.with_name(unwrapped_name)) + else: + context.append(arg) + unwrap_tensor_args_str = "\n ".join(unwrapped_tensor_args) + return unwrap_tensor_args_str, context + + +@dataclass(frozen=True) +class GenLazyIR(ABC): + backend_index: BackendIndex + backend_name: str + node_base: str + use_lazy_shape: bool + + @method_with_native_function + def __call__(self, f: NativeFunctionsGroup | NativeFunction) -> list[str]: + func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func + metadata = self.backend_index.get_kernel( + f.functional if isinstance(f, NativeFunctionsGroup) else f + ) + schema = LazyIrSchema( + func, symint=metadata is not None and metadata.supports_symint() + ) + return self.gen(schema) + + # there is no lowering functionality generated unless this IR base class is subclassed and + # implemented as a backend-specific node + def lowering_function(self, schema: LazyIrSchema) -> str: + return "" + + def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str: + return "" + + def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str: + return f"""bool CanBeReused({node_ctor_args}) const {{ + return false; + }}""" + + def node_base_ctor_call(self, schema: LazyIrSchema) -> str: + value_args = schema.filtered_args(values=True, scalars=False) + # backends can customize the way the node base class constructor is called, + # as long as all of its arguments can be generated from information available from the schema + base_ctor_value_args_list = [] + for arg in value_args: + if isinstance(arg.lazy_type, (BaseCType, VectorCType)): + base_ctor_value_args_list.append(f"{arg.name}") + elif isinstance(arg.lazy_type, OptionalCType): 
+ base_ctor_value_args_list.append(f"{arg.name}.value_or(kNullValue)") + else: + raise AssertionError( + f"Unsupported type ({arg.lazy_type}) - add support if necessary" + ) + base_ctor_value_args = ", ".join(base_ctor_value_args_list) + + scalar_args = schema.filtered_args(values=False, scalars=True) + + # Shape construction. + # Conditionally build shape depending on specified shape property + if schema.properties.ShapePrecompute: + shape_ctor_arg = "std::move(shapes)," + elif schema.properties.ShapeCompute: + shape_args = [a.name for a in value_args] + shape_args.extend(a.name for a in scalar_args) + shape_ctor_arg = f"compute_shape_{schema.name}({', '.join(shape_args)})," + elif schema.properties.ShapeCache: + shape_args = [f"operand({i})" for i in range(len(value_args))] + shape_args.extend(a.name for a in scalar_args) + shape_ctor_arg = f"[&](){{ return compute_shape_{schema.name}({', '.join(shape_args)})[0]; }}," + else: + shape_ctor_arg = "" + + scalar_hashes = ", ".join(f"{a.name}" for a in scalar_args) + + return f"""{self.node_base}( + {schema.node_name}::ClassOpKind(), + OpList{{{base_ctor_value_args}}}, + {shape_ctor_arg} + /* num_outputs */ {len(schema.returns)}, + torch::lazy::MHash({scalar_hashes}))""" + + def gen(self, schema: LazyIrSchema) -> list[str]: + opkind = schema.opkind or aten_symbol(schema) + + # for now, we just want one IR class decl and soon after also the method defs + # and we use the functional version not out/inplace. + all_args = schema.filtered_args() + scalar_args = schema.filtered_args(values=False, scalars=True) + + ctor_args = [f"const {i.lazy_type.cpp_type()}& {i.name}" for i in all_args] + reuse_ctor_args = ", ".join(ctor_args) + if self.use_lazy_shape and schema.properties.ShapePrecompute: + ctor_args.append("std::vector&& shapes") + node_ctor_args = ", ".join(ctor_args) + + scalar_initializers = ",\n ".join( + [ + # This code is just special casing the mapping from string_view -> strings + f"{a.name}({a.name}.has_value() ? 
::std::make_optional(std::string(*{a.name})) : ::std::nullopt)" + if a.lazy_type.cpp_type() == "::std::optional" + else f"{a.name}({a.name})" + for a in scalar_args + ] + ) + if len(scalar_initializers): + scalar_initializers = f",\n {scalar_initializers}" + scalar_decls = "\n ".join( + [ + f"std::string {a.name};" + if a.lazy_type.cpp_type() == "c10::string_view" + else f"::std::optional {a.name};" + if a.lazy_type.cpp_type() == "::std::optional" + else f"{a.lazy_type.cpp_type()} {a.name};" + for a in scalar_args + ] + ) + optional_values = [ + arg.name + for arg in schema.filtered_args(values=True, scalars=False) + if isinstance(arg.lazy_type, OptionalCType) + ] + has_optional_decls = "\n ".join( + [f"bool has_{value}: 1;" for value in optional_values] + ) + has_optional_defs = "\n ".join( + [f"has_{value} = !!{value};" for value in optional_values] + ) + members_to_string = [] + for arg in scalar_args: + if isinstance(arg.lazy_type, OptionalCType): + value = f"{arg.name}.value()" + if arg.is_generator: + value = '"torch.Generator()"' + members_to_string.append( + f"""if ({arg.name}.has_value()) {{ + ss << ", {arg.name}=" << {value}; + }} else {{ + ss << ", {arg.name}=null"; + }}""" + ) + else: + members_to_string.append(f'ss << ", {arg.name}=" << {arg.name};') + members_to_string_str = "\n ".join(members_to_string) + + return [ + f"""\ +class {schema.node_name} : public {self.node_base} {{ + public: + static torch::lazy::OpKind ClassOpKind() {{ + return torch::lazy::OpKind({opkind}); + }} + + {schema.node_name}({node_ctor_args}) + : {self.node_base_ctor_call(schema)}{scalar_initializers} + {{ + {has_optional_defs} + }} + + std::string ToString() const override {{ + std::stringstream ss; + ss << {self.node_base}::ToString(); + {members_to_string_str} + return ss.str(); + }} + + {self.create_function(schema, reuse_ctor_args)} + + {self.can_be_reused_function(schema, reuse_ctor_args)} + + {self.lowering_function(schema)} + + {scalar_decls} + {has_optional_decls} + +}}; + +""", + ] + + +@dataclass(frozen=True) +class GenTSLazyIR(GenLazyIR): + def lowering_function(self, schema: LazyIrSchema) -> str: + signature = """ + torch::lazy::TSOpVector Lower( + std::shared_ptr function, + torch::lazy::TSLoweringContext* loctx) const override""" + + if schema.properties.LowerDeclOnly: + return f"{signature};" + elif schema.properties.Lower: + return f"""{signature} {{ + {ts_lowering_body(schema)} + }} + """ + else: + return "" + + def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str: + signature = f"static NodePtr Create({node_ctor_args})" + if schema.properties.CreateFnDeclOnly: + return f"{signature};" + elif not schema.properties.CreateFn: + return "" + return f"""{signature} {{ + return ReuseOrMakeNode<{schema.node_name}>(data); + }}""" + + def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str: + signature = f"bool CanBeReused({node_ctor_args}) const" + if schema.properties.CanBeReusedDeclOnly: + return f"{signature};" + elif not schema.properties.CanBeReused: + return "" + value_comparison = [] + for arg in itertools.chain(schema.positional_values, schema.keyword_values): + if isinstance(arg.lazy_type, OptionalCType): + value_comparison.append( + f"nullable_operand(i++) == {arg.name}.value_or(kNullValue)" + ) + else: + value_comparison.append(f"operand(i++) == {arg.name}") + for arg in itertools.chain(schema.positional_scalars, schema.keyword_scalars): + if isinstance(arg.lazy_type, OptionalCType): + value_comparison.append( + 
f"((!this->{arg.name}&&!{arg.name}) || (this->{arg.name}&&{arg.name} && *(this->{arg.name}) == *{arg.name}))" + ) + else: + value_comparison.append(f"this->{arg.name} == {arg.name}") + value_comparison_str = " &&\n ".join(value_comparison) + + return f"""{signature} {{ + size_t i = 0; + return ({value_comparison_str}); + }}""" + + +@dataclass(frozen=True) +class GenLazyNativeFuncDefinition: + class_method_name: str + backend_index: BackendIndex + tensor_class: str + gen_forced_fallback_code: bool + backend_namespace: str + get_tensorlist: str + get_tensor_or_wrap_number: str + try_get_tensor: str + metrics_counter: str + create_tensor: str + create_from_first_tensor: bool + create_aten_from_ltc_tensor: str + tuple_aten_from_ltc_tensors: str + lazy_tensor_ptr: str + get_device_fn: str + + def lazy_tensor_decls(self, func: NativeFunction, schema: LazyIrSchema) -> str: + value_args = schema.filtered_args(values=True, scalars=False) + # Generates lazy_{name} variables for LazyTensors wrapping input tensors + lazy_tensor_decls: list[str] = [] + for arg in value_args: + if arg.is_wrapped_scalar: + if isinstance(arg.lazy_type, OptionalCType): + lazy_tensor_decls.append( + f"""auto node_{arg.name} = {arg.name} ? + std::make_optional(torch::lazy::LazyGraphExecutor::Get()-> + GetIrValueForScalarFromCodegen(*{arg.name}, *common_device)): + ::std::nullopt;""" + ) + else: + lazy_tensor_decls.append( + f"""auto node_{arg.name} = torch::lazy::LazyGraphExecutor::Get()-> + GetIrValueForScalarFromCodegen({arg.name}, *common_device);""" + ) + elif arg.is_symint_or_list: + continue # values are extracted in isValueType + elif isinstance(arg.lazy_type, BaseCType): + if arg.lazy_type.type is tensorListValueT: + lazy_tensor_decls.append( + f"auto lazy_{arg.name}_tensorlist = " + f"{self.backend_namespace}::{self.get_tensorlist}({arg.name});" + ) + else: + lazy_tensor_decls.append( + f"{self.lazy_tensor_ptr} lazy_{arg.name} = " + f"{self.backend_namespace}::{self.get_tensor_or_wrap_number}({arg.name}, *common_device);" + ) + elif isinstance(arg.lazy_type, OptionalCType): + assert arg.lazy_type.elem == BaseCType(getValueT()), arg.lazy_type.elem + # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it + # until we encounter a real world example. 
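+                # For an optional tensor input the generated declaration falls back to a
+                # default-constructed at::Tensor. With the TorchScript backend's settings
+                # this emits roughly (names illustrative; the helpers come from this
+                # dataclass's fields):
+                #   LazyTensorPtr lazy_bias =
+                #       torch::lazy::TryGetLtcTensor(bias.value_or(at::Tensor()));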
+ lazy_tensor_decls.append( + f"{self.lazy_tensor_ptr} lazy_{arg.name} = " + f"{self.backend_namespace}::{self.try_get_tensor}({arg.name}.value_or(at::Tensor()));" + ) + else: + raise AssertionError( + f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})" + ) + return ("\n ").join(lazy_tensor_decls) + + def force_eager_fallback( + self, + func: NativeFunction, + schema: LazyIrSchema, + metadata: BackendMetadata, + sig: DispatcherSignature | NativeSignature, + ) -> str: + if self.gen_forced_fallback_code: + return gen_fallback_code( + schema, sig, overload_name=func.func.name.overload_name + ) + return "" + + def metrics(self, func: NativeFunction, schema: LazyIrSchema) -> str: + return f"{self.metrics_counter};" + + def get_device(self, func: NativeFunction, schema: LazyIrSchema) -> str: + value_args = schema.filtered_args(values=True, scalars=False) + scalar_args = schema.filtered_args(values=False, scalars=True) + value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar] + optional_device = OptionalCType(BaseCType(deviceT)) + optional_devices = [ + a.name for a in scalar_args if a.lazy_type == optional_device + ] + assert ( + len(value_types_names) > 0 or len(optional_devices) > 0 + ), "Expected at least one Value or Device type" + get_device_str = ( + f"{self.get_device_fn}({', '.join(value_types_names + optional_devices)})" + ) + return f"""auto common_device = {get_device_str}; + TORCH_INTERNAL_ASSERT(common_device); + """ + + def shape_inference(self, func: NativeFunction, schema: LazyIrSchema) -> str: + metadata = self.backend_index.get_kernel(func) + assert metadata is not None + all_args = schema.filtered_args() + returns_length = len(schema.returns) + # call the meta kernel if it exists, to compute output shape/dtype for our IR + # Note [Generated LTC Shape Functions] + # LTC uses meta tensors from core to do shape inference when possible, and otherwise + # we generate a shape function declaration that needs to be manually implemented. + # How do we detect which ops are eligible to use meta tensors? + # In general we should be able to use meta tensors not just on structured operators, + # but also on composite operators that are implemented in terms of structured kernels. + # We don't currently have a way of knowing at codegen time which ops are implemented that way. + # This is the case for all view and view_copy operators however, so we're going to + # use them specifically for all of the view_copy ops (instead of manually writing shape rules for all of them). + is_view_copy_op = "view_copy" in func.tags + is_structured = func.structured or func.structured_delegate is not None + if is_structured or is_view_copy_op: + meta_out = """ +std::vector shapes{torch::lazy::Shape(out_meta.scalar_type(), out_meta.sizes().vec())};""" + if returns_length > 1: + + def this_shape(i: int) -> str: + return f"torch::lazy::Shape(std::get<{i}>(out_meta).scalar_type(), std::get<{i}>(out_meta).sizes().vec())" + + shapes_str = ",".join([this_shape(i) for i in range(returns_length)]) + meta_out = "std::vector shapes{" + shapes_str + "};" + + # Convert tensor args to the meta device and call it. + # (We can't pass in the input tensors directly, because they are "functional wrappers". + # If any of the meta kernels call a tensor op and redispatch, we don't want to hit the functionalize kernels.) + # Even at::meta:: functions might redispatch, e.g. if they call into view ops. 
+ dispatcher_sig = DispatcherSignature.from_schema(func.func) + meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig) + meta_call_args = [ + e.expr + for e in translate( + meta_call_ctx, dispatcher_sig.arguments(), method=False + ) + ] + if is_view_copy_op: + # view_copy ops always have a CompositeExplicitAutogradNonFunctional kernel + assert func.has_composite_explicit_autograd_non_functional_kernel + dispatch_ns = "compositeexplicitautogradnonfunctional" + else: + dispatch_ns = "meta" + aten_name = schema.aten_name + # TODO: this is trolling + if func.func.has_symint() and metadata.supports_symint(): + aten_name += "_symint" + shape_str = f"""\ + {meta_conversion_str} + auto out_meta = at::{dispatch_ns}::{aten_name}({', '.join(meta_call_args)}); + {meta_out}""" + else: + shape_sig = ComputeShapeSignature( + metadata.kernel, func, symint=metadata.supports_symint() + ) + shape_str = f""" + auto shapes = {shape_sig.shape_call};""" + + shape_str += f""" + TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" + + # Calculating which dimensions are symbolic + func_schema_str = "aten::" + str(func.func) + shape_str += f""" + if(torch::lazy::symbolicShapeEnabled()){{ + std::vector inputs = {{ {', '.join(str(a.name) for a in all_args)} }}; + const char* schema_str = "{func_schema_str}"; + applySymbolicShapesOnLT(schema_str, inputs, shapes); + }} + """ + return shape_str + + def build_ir_node(self, func: NativeFunction, schema: LazyIrSchema) -> str: + node_ctor_input_str = node_ctor_inputs(schema) + return f"""torch::lazy::NodePtr node = torch::lazy::ReuseNode<{schema.node_name}>({node_ctor_input_str}); + if (!node) {{ + {self.shape_inference(func, schema)} + node = torch::lazy::MakeNode<{schema.node_name}>({node_ctor_input_str}, std::move(shapes)); + CacheNode(node); + }} + """ + + def create_lazy_tensor(self, first_tensor_name: str | None = None) -> str: + # xla uses an instance method for tensor creation, for the time being + if self.create_from_first_tensor: + # TODO(whc) remove this if XLA switches to using static method for creation + assert ( + first_tensor_name is not None + ), "Requires first tensor to create lazy tensor" + return f"{first_tensor_name}.{self.create_tensor}" + return f"{self.backend_namespace}::{self.create_tensor}" + + def return_aten_tensor(self, func: NativeFunction, schema: LazyIrSchema) -> str: + returns_length = len(schema.returns) + value_args = schema.filtered_args(values=True, scalars=False) + value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar] + first_tensor_name = value_types_names[0] if len(value_types_names) > 0 else None + bridge_str = f"""auto result = {self.create_aten_from_ltc_tensor}( + {self.create_lazy_tensor(first_tensor_name)}(std::move(node), *common_device));""" + + if returns_length > 1: + assert ( + len(value_types_names) > 0 + ), "Code below assumes there is at least one tensor arg" + bridge_str = f"""std::vector<{self.lazy_tensor_ptr}> lazy_tensors; + for (int i = 0; i < {returns_length}; i++) {{ + lazy_tensors.push_back({self.create_lazy_tensor(first_tensor_name)}({getValueT()}(node, i), *common_device)); + }} + auto result = {self.tuple_aten_from_ltc_tensors}<{returns_length}>(lazy_tensors);""" + + if schema.name.name.inplace or func.func.is_out_fn(): + assert returns_length == 1, ( + "We assumed there was no such case where an op is an in-place variant " + f"and has tuple outputs, but got tuple of len {returns_length}." 
+ ) + bridge_str = f"""lazy_{first_tensor_name}->SetInPlaceIrValue(node); + auto& result = {first_tensor_name};""" + + bridge_str += """ + return result;""" + return bridge_str + + @method_with_native_function + def __call__(self, func: NativeFunction) -> list[str]: + sig = kernel_signature(func, self.backend_index) + metadata = self.backend_index.get_kernel(func) + assert metadata is not None + schema = LazyIrSchema(func.func, symint=metadata.supports_symint()) + return [ + f"""\ + {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ + {self.force_eager_fallback(func, schema, metadata, sig)} + {self.metrics(func, schema)} + {self.get_device(func, schema)} + {self.lazy_tensor_decls(func, schema)} + {self.build_ir_node(func, schema)} + {self.return_aten_tensor(func, schema)} + }}\n + """ + ] + + +class ComputeShapeSignature: + """ + Here we use the base name as the suffix of the signature to avoid generating for in-place variants. + """ + + def __init__(self, kernel_name: str, f: NativeFunction, *, symint: bool) -> None: + self.__schema = LazyIrSchema(f.func, symint=symint) + self.__dispatch_args = ", ".join( + [a.decl() for a in dispatcher.arguments(f.func, symint=symint)] + ) + self.__call_args = ", ".join( + [f"{arg.name}" for arg in self.__schema.filtered_args(generator=True)] + ) + self.__kernel_name = kernel_name + + def __decl_suffix(self) -> str: + return f"{self.__kernel_name}({self.__dispatch_args})" + + def __call_suffix(self) -> str: + return f"{self.__kernel_name}({self.__call_args})" + + @property + def shape_decl(self) -> str: + return f"TORCH_API std::vector compute_shape_{self.__decl_suffix()}" + + @property + def shape_call(self) -> str: + return f"torch::lazy::compute_shape_{self.__call_suffix()}" + + +@dataclass(frozen=True) +class GenLazyShapeInferenceDefinition: + backend_index: BackendIndex + tensor_class: str + + @method_with_native_function + def __call__(self, f: NativeFunction) -> list[str]: + metadata = self.backend_index.get_kernel(f) + assert metadata is not None + + # See Note [Generated LTC Shape Functions] + is_view_copy_op = "view_copy" in f.tags + is_structured = f.structured or f.structured_delegate is not None + if is_structured or is_view_copy_op: + return [] + else: + shape_sig = ComputeShapeSignature( + metadata.kernel, f, symint=metadata.supports_symint() + ) + return ["\n".join([f"{shape_sig.shape_decl};"])] + + +def generate_non_native_lazy_ir_nodes( + non_native: list[dict[str, Any]], gen_lazy_ir: GenLazyIR +) -> list[str]: + """Generate the non-native lazy IR node classes""" + nodes = [] + for op in non_native: + # Set default properties for Non-Native IRs + properties = LazyIrProperties("ShapeCache", "CanBeReused", "LowerDeclOnly") + for p in op.get("properties", []): + setattr(properties, p, True) + + # non-native is assumed to want symint bindings if you wrote symint + schema = LazyIrSchema(FunctionSchema.parse(op["func"]), properties, symint=True) + schema.opkind = op.get("opkind") + nodes.append(gen_lazy_ir.gen(schema)[0]) + + return nodes diff --git a/torchgen/dest/lazy_ts_lowering.py b/torchgen/dest/lazy_ts_lowering.py new file mode 100644 index 00000000000..70161216d8e --- /dev/null +++ b/torchgen/dest/lazy_ts_lowering.py @@ -0,0 +1,48 @@ +from torchgen.api.lazy import LazyArgument, LazyIrSchema +from torchgen.api.types import OptionalCType + + +def ts_lowering_body(schema: LazyIrSchema) -> str: + # for now, we just want one IR class decl and soon after also the method defs + # and we use the functional version not 
out/inplace. + emplace_arguments = [] + + def get_value(arg: LazyArgument) -> str: + if isinstance(arg.lazy_type, OptionalCType): + return f"has_{arg.name} ? loctx->GetOutputOp(operand(i++)) : nullptr" + return "loctx->GetOutputOp(operand(i++))" + + for arg in schema.positional_args: + if arg.is_lazy_value: + emplace_arguments.append(get_value(arg)) + continue + emplace_arguments.append(f'"{arg.name}", {arg.name}') + + emplace_arguments_str = "\n ".join( + [f"arguments.emplace_back({a});" for a in emplace_arguments] + ) + emplace_kwarg_values = [ + f'"{arg.name}", {get_value(arg)}' for arg in schema.keyword_values + ] + emplace_kwarg_scalars = [ + f'"{arg.name}", {arg.name}' for arg in schema.keyword_scalars + ] + emplace_kwarguments = "\n ".join( + [ + f"kwarguments.emplace_back({a});" + for a in emplace_kwarg_values + emplace_kwarg_scalars + ] + ) + return f"""\ + std::vector arguments; + std::vector kwarguments; + arguments.reserve({len(emplace_arguments)}); + kwarguments.reserve({len(emplace_kwarg_values + emplace_kwarg_scalars)}); + size_t i = 0; + {emplace_arguments_str} + {emplace_kwarguments} + torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); + TORCH_CHECK_EQ({schema.aten_name}_out.size(), {len(schema.returns)}); + + return {schema.aten_name}_out; +""" diff --git a/torchgen/dest/native_functions.py b/torchgen/dest/native_functions.py new file mode 100644 index 00000000000..a93405555bc --- /dev/null +++ b/torchgen/dest/native_functions.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import torchgen.api.meta as meta +import torchgen.api.structured as structured +from torchgen.api.types import kernel_signature +from torchgen.context import with_native_function_and_index +from torchgen.model import BackendIndex, NativeFunction, NativeFunctionsGroup +from torchgen.utils import mapMaybe + + +@with_native_function_and_index +def gen_unstructured(f: NativeFunction, backend_index: BackendIndex) -> str | None: + sig = kernel_signature(f, backend_index) + metadata = backend_index.get_kernel(f) + if metadata is None: + return None + if "legacy::" in metadata.kernel: + return None + else: + prefix = "static" if backend_index.external else "TORCH_API" + return f"{prefix} {sig.decl(name=metadata.kernel)};" + + +@with_native_function_and_index +def gen_structured(g: NativeFunctionsGroup, backend_index: BackendIndex) -> list[str]: + meta_name = meta.name(g) + out_args = structured.impl_arguments(g) + metadata = backend_index.get_kernel(g) + if metadata is None: + return [] + prefix = "" if backend_index.external else "TORCH_API " + return [ + f"""\ +struct {prefix}structured_{metadata.kernel} : public at::meta::structured_{meta_name} {{ +void impl({', '.join(a.decl() for a in out_args)}); +}}; +""" + ] + + +# Generates NativeFunctions.h, a list of forward declarations of all +# actual kernel definitions we keep in aten/src/ATen/native/ +@with_native_function_and_index +def compute_native_function_declaration( + g: NativeFunctionsGroup | NativeFunction, backend_index: BackendIndex +) -> list[str]: + metadata = backend_index.get_kernel(g) + if isinstance(g, NativeFunctionsGroup): + if metadata is not None and metadata.structured: + if backend_index.external: + # Structured hasn't been tested with external backends yet. + raise AssertionError( + "Structured external backend functions are not implemented yet." 
+ ) + else: + return gen_structured(g, backend_index) + else: + return list( + mapMaybe(lambda f: gen_unstructured(f, backend_index), g.functions()) + ) + else: + x = gen_unstructured(g, backend_index) + return [] if x is None else [x] diff --git a/torchgen/dest/register_dispatch_key.py b/torchgen/dest/register_dispatch_key.py new file mode 100644 index 00000000000..cb7dc00a60b --- /dev/null +++ b/torchgen/dest/register_dispatch_key.py @@ -0,0 +1,1003 @@ +from __future__ import annotations + +import itertools +import textwrap +from dataclasses import dataclass +from typing import Literal, TYPE_CHECKING + +import torchgen.api.cpp as cpp +import torchgen.api.meta as meta +import torchgen.api.structured as structured +from torchgen.api.translate import translate +from torchgen.api.types import ( + BaseCType, + Binding, + ConstRefCType, + CppSignature, + CppSignatureGroup, + DispatcherSignature, + Expr, + kernel_signature, + MutRefCType, + NamedCType, + NativeSignature, + tensorT, +) +from torchgen.context import method_with_native_function, native_function_manager +from torchgen.model import ( + Argument, + BackendIndex, + DeviceCheckType, + DispatchKey, + gets_generated_out_inplace_wrapper, + is_cuda_dispatch_key, + NativeFunction, + NativeFunctionsGroup, + SchemaKind, + TensorOptionsArguments, +) +from torchgen.utils import assert_never, mapMaybe, Target + + +if TYPE_CHECKING: + from torchgen.selective_build.selector import SelectiveBuilder + + +def gen_registration_headers( + backend_index: BackendIndex, + per_operator_headers: bool, + rocm: bool, +) -> list[str]: + if per_operator_headers: + headers = ["#include "] + else: + headers = ["#include "] + + if backend_index.dispatch_key in (DispatchKey.CPU, DispatchKey.Meta): + headers.append("#include ") + elif backend_index.dispatch_key == DispatchKey.CUDA: + if rocm: + headers.append("#include ") + else: + headers.append("#include ") + elif backend_index.dispatch_key == DispatchKey.MPS: + headers.append("#include ") + elif backend_index.dispatch_key == DispatchKey.XPU: + # XPU specific, this header resides in third_party/torch-xpu-ops + headers.append("#include ") + elif per_operator_headers: + headers += [ + "#include ", + "#include ", + "#include ", + "#include ", + ] + else: + headers.append("#include ") + + headers.append("#include ") + return headers + + +def gen_empty_impl_names( + backend_index: BackendIndex, +) -> tuple[str | None, str | None]: + empty_impl = None + empty_strided_impl = None + + if backend_index.dispatch_key in ( + DispatchKey.Meta, + DispatchKey.CPU, + DispatchKey.CUDA, + DispatchKey.MPS, + DispatchKey.XPU, + ): + dispatch = str(backend_index.dispatch_key).lower() + empty_impl = f"at::detail::empty_{dispatch}" + empty_strided_impl = f"at::detail::empty_strided_{dispatch}" + elif backend_index.dispatch_key in ( + DispatchKey.CompositeExplicitAutogradNonFunctional, + DispatchKey.QuantizedCPU, + DispatchKey.QuantizedCUDA, + DispatchKey.XPU, + ): + empty_impl = "at::empty" + empty_strided_impl = "at::empty_strided" + + return empty_impl, empty_strided_impl + + +def gen_create_out_helper(backend_index: BackendIndex) -> list[str]: + if backend_index.dispatch_key == DispatchKey.Meta: + empty_options = "options.device(at::kMeta)" + else: + empty_options = "options" + + empty_impl, empty_strided_impl = gen_empty_impl_names(backend_index) + if empty_impl is None: + return [] + + return [ + f""" +Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{ + if (strides.empty()) {{ + return 
{empty_impl}(sizes, {empty_options}); + }} else {{ + return {empty_strided_impl}(sizes, strides, {empty_options}); + }} +}} +""" + ] + + +def gen_maybe_create_proxy_helper(backend_index: BackendIndex) -> list[str]: + _, empty_strided_impl = gen_empty_impl_names(backend_index) + return ( + [] + if empty_strided_impl is None + else [ + f""" +std::optional maybe_create_proxy(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{ + if (out.strides() != strides) {{ + return {empty_strided_impl}(sizes, strides, options); + }} + return std::nullopt; +}} +""" + ] + ) + + +def gen_resize_out_helper(backend_index: BackendIndex) -> list[str]: + if backend_index.dispatch_key == DispatchKey.CompositeExplicitAutogradNonFunctional: + # The function isn't used by this key (since only functional ops have a kernel for this key), + # so we need to not include it to avoid a defined-but-not-used error. + return [] + return [ + """ +void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) { + TORCH_CHECK(options.dtype() == out.dtype(), + "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead"); + TORCH_CHECK(options.device() == out.device(), + "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead"); + const bool resized = at::native::resize_output(out, sizes); + // Only restride if a resize occurred; otherwise we ignore the (advisory) + // strides from the meta function and directly use the output tensor's + // preexisting strides + if (resized) { + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + // TODO: avoid the redispatch here + out.as_strided_(sizes, strides); + } else if (options.memory_format_opt().has_value()) { + out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); + } + } +} +""" + ] + + +def gen_check_inplace_helper(backend_index: BackendIndex) -> list[str]: + return [ + """ +void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) { + // These checks are needed on those operators that: + // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm') + // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod') + // For other operators (e.g. 'add'), 'TensorIterator' already checks + // these things separately. + TORCH_CHECK(options.dtype() == self.dtype(), + "Bad in-place call: ", + "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match"); + TORCH_CHECK(options.device() == self.device(), + "Bad in-place call: ", + "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match"); + TORCH_CHECK(sizes == self.sizes(), + "Bad in-place call: ", + "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match"); +} +""" + ] + + +def gen_registration_helpers(backend_index: BackendIndex) -> list[str]: + return [ + 'C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-function")', + *gen_create_out_helper(backend_index), + *gen_resize_out_helper(backend_index), + *gen_check_inplace_helper(backend_index), + *gen_maybe_create_proxy_helper(backend_index), + "C10_DIAGNOSTIC_POP()", + ] + + +# Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp). +# +# - The primary function of this file is to register all of the +# implementations for the given dispatch key to the dispatcher, +# so they are available for use in PyTorch. 
If dispatch is +# None, we generate schema (def) registrations and catchall +# registrations. +# - The secondary function of this file is to generate a wrapper +# around functions. In CPUType these wrappers do nothing +# (and should be removed), but in other cases they handle +# DeviceGuard. A small extra benefit of wrappers is they +# are not overloaded, so they can be used in the registration +# API without having to disambiguate which overload you want +# (as would be the case if you directly registered native:: +# functions). +# - The tertiary function of this file is to generate *static* +# cpp API bindings which can be used to bypass dispatcher +# directly to kernels, but with user-friendly cpp-style API +@dataclass(frozen=True) +class RegisterDispatchKey: + backend_index: BackendIndex + + target: Literal[ + Target.ANONYMOUS_DEFINITION, + Target.NAMESPACED_DEFINITION, + Target.NAMESPACED_DECLARATION, + Target.REGISTRATION, + ] + + # Selector object to determine which operators to generate + # registration code for. + selector: SelectiveBuilder + + # Whether or not we are actually code-genning for ROCm + rocm: bool + + # Whether or not to generate symint registrations or not. External users + # of codegen who don't care about symints can set this to false to get + # non-SymInt codegen + symint: bool + + # The class that all unstructured native functions live under. This is used to improve + # compiler error messages when a kernel writer adds a native function with the wrong signature. + # This is only used in unstructured kernels, since structured kernels already live in a class. + # Finally, this field is currently Optional because it is only used by external backends. + # It would be nice if we can add the same logic to in-tree kernels too, but that requires updating + # all of the existing kernel signatures scattered across aten/src/ATen/native. + class_method_name: str | None + + # Only set to true in lightweight dispatch. If lightweight dispatch is enabled we are registering + # operators into JIT op registry, thus we need to avoid generating code to register into the dispatcher. + skip_dispatcher_op_registration: bool + + @staticmethod + def gen_device_check( + type: DeviceCheckType, args: list[Argument], method_name: str + ) -> str: + if type == DeviceCheckType.NoCheck: + return " // No device check\n" + + device_check = "std::optional common_device = std::nullopt;\n" + device_check += "(void)common_device; // Suppress unused variable warning\n" + for arg in args: + # Only tensor like arguments are eligible + if arg.type.is_tensor_like(): + device_check += f""" + c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");""" + return device_check + + @method_with_native_function + def __call__(self, f: NativeFunctionsGroup | NativeFunction) -> list[str]: + if isinstance(f, NativeFunctionsGroup): + g: NativeFunctionsGroup = f + # Note: We call gen_structured() if the operator is marked structured, regardless of the backend. + # gen_structured() has special logic to handle auto-generated kernels. + if g.structured: + return self.gen_structured(g) + else: + return list( + mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions()) + ) + elif isinstance(f, NativeFunction): + r = self.gen_unstructured(f) + return [] if r is None else [r] + else: + assert_never(f) + + def wrapper_kernel_sig( + self, f: NativeFunction + ) -> NativeSignature | DispatcherSignature: + # The prefix is just to ensure uniqueness. 
The Dispatcher API doesn't guarantee unique kernel names. + return DispatcherSignature.from_schema( + f.func, + prefix=f"wrapper_{self.backend_index.dispatch_key}_{f.func.name.overload_name}_", + symint=self.symint, + ) + + def gen_out_inplace_wrapper( + self, f: NativeFunction, g: NativeFunctionsGroup | None + ) -> str | None: + if g is None: + return None + k = f.func.kind() + if k is SchemaKind.inplace: + copy_op = "at::_copy_from" + elif k is SchemaKind.out: + copy_op = "at::_copy_from_and_resize" + else: + raise AssertionError("gen_out_inplace_wrapper called on a functional op") + + sig = self.wrapper_kernel_sig(f) + name = sig.name() + + func_res = f"{name}_tmp" + return_names = cpp.return_names(f) + if len(return_names) > 1: + updates = "\n ".join( + f"{copy_op}(std::get<{i}>({func_res}), {ret_name});" + for i, ret_name in enumerate(return_names) + ) + returns = f'{sig.returns_type().cpp_type()}({", ".join(return_names)})' + elif len(return_names) == 1: + ret_name = return_names[0] + updates = f"{copy_op}({func_res}, {ret_name});" + returns = ret_name + else: + assert len(f.func.arguments.out) == 1 + returns = "" + out_arg = f.func.arguments.out[0] + if out_arg.type.is_list_like(): + updates = f"""\ + for (int64_t i = 0; i < {func_res}.size(); ++i) {{ + {copy_op}({func_res}[i], {out_arg.name}[i]); + }}""" + else: + updates = f"{copy_op}({func_res}, {out_arg.name});" + + functional_sig = self.wrapper_kernel_sig(g.functional) + wrapper_name = sig.name() + + return f"""\ +{sig.defn(name=wrapper_name)} {{ + auto {func_res} = {functional_sig.name()}({", ".join(e.expr for e in translate(sig.arguments(), functional_sig.arguments()))}); + {updates} + return {returns}; +}} +""" + + def gen_structured(self, g: NativeFunctionsGroup) -> list[str]: + metadata = self.backend_index.get_kernel(g) + if self.backend_index.dispatch_key == DispatchKey.Meta: + assert not self.backend_index.has_kernel(g.out), ( + "Do not explicitly specify Meta dispatch key on structured " + "functions, they will be automatically generated for you" + ) + elif ( + self.backend_index.dispatch_key + == DispatchKey.CompositeExplicitAutogradNonFunctional + ): + assert not self.backend_index.has_kernel(g.out), ( + "Do not explicitly specify CompositeExplicitAutograd dispatch key on structured " + "functions, they will be automatically generated for you" + ) + elif metadata is None or not metadata.structured: + return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())) + structured_gen = StructuredRegisterDispatchKey( + self.backend_index, + self.target, + self.selector, + self.rocm, + self.symint, + self.class_method_name, + self.skip_dispatcher_op_registration, + g, + ) + return list(mapMaybe(structured_gen.gen_one, g.functions())) + + def gen_unstructured( + self, f: NativeFunction, g: NativeFunctionsGroup | None = None + ) -> str | None: + with native_function_manager(f): + inplace_meta = False + gets_out_inplace_wrapper = False + if not self.backend_index.has_kernel(f): + if ( + self.backend_index.dispatch_key == DispatchKey.Meta + and f.func.kind() is SchemaKind.inplace + and + # Defer to composites for meta implementation + not f.has_composite_kernel + and + # Inplace list operations are not supported + len(f.func.returns) == 1 + ): + inplace_meta = True + elif ( + not self.backend_index.use_out_as_primary + and g is not None + and gets_generated_out_inplace_wrapper(f, g, self.backend_index) + ): + # We want to generate inplace/out wrappers, that don't have a kernel for the backend. 
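+                    # gen_out_inplace_wrapper (above) synthesizes the kernel by calling
+                    # the backend's functional variant and copying the result into the
+                    # out/self tensor via at::_copy_from / at::_copy_from_and_resize.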
+ gets_out_inplace_wrapper = True + else: + return None + if f.manual_kernel_registration: + return None + + if ( + self.target is Target.REGISTRATION + and not self.selector.is_native_function_selected(f) + ): + return None + + sig = self.wrapper_kernel_sig(f) + + name = sig.name() + returns_type = sig.returns_type().cpp_type() + args = sig.arguments() + args_str = ", ".join(a.defn() for a in args) + + # See Note [Direct dispatch bindings] + cpp_sig_group = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=False + ) + + # TODO: dedupe this with the structured codegen + if self.target is Target.NAMESPACED_DECLARATION: + result = "" + for cpp_sig in cpp_sig_group.signatures(symint=self.symint): + result += f"TORCH_API {cpp_sig.decl()};\n" + return result + elif self.target is Target.NAMESPACED_DEFINITION: + + def generate_defn(cpp_sig: CppSignature) -> str: + return f""" +{cpp_sig.defn()} {{ +return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); +}} +""" + + result = "" + for cpp_sig in cpp_sig_group.signatures(symint=self.symint): + result += generate_defn(cpp_sig) + return result + + elif self.target is Target.ANONYMOUS_DEFINITION: + # short circuit for inplace_meta + if inplace_meta: + assert f.func.arguments.self_arg is not None + self_arg_name = f.func.arguments.self_arg.argument.name + # TODO: handle in place on tensor list + return f""" +{returns_type} {name}({args_str}) {{ + TORCH_CHECK_NOT_IMPLEMENTED({self_arg_name}.is_meta(), + "Cannot inplace into non-meta tensor with meta tensor argument"); + return {self_arg_name}; +}} +""" + + # short circuit for generated inplace/out wrappers + if gets_out_inplace_wrapper: + return self.gen_out_inplace_wrapper(f, g) + + metadata = self.backend_index.get_kernel(f) + if metadata is None: + return None + if self.class_method_name is None: + impl_name = f"{metadata.cpp_namespace}::{metadata.kernel}" + else: + impl_name = f"{metadata.cpp_namespace}::{self.class_method_name}::{metadata.kernel}" + + kernel_sig = kernel_signature(f, self.backend_index) + + args_exprs_str = ", ".join( + e.expr + for e in translate( + sig.arguments(), kernel_sig.arguments(), method=False + ) + ) + + device_check = " // No device check\n" + # Backends that require device guards presumably also require device checks. + if self.backend_index.device_guard: + device_check_args = itertools.chain( + f.func.arguments.out, f.func.arguments.flat_positional + ) + device_check = RegisterDispatchKey.gen_device_check( + f.device_check, list(device_check_args), name + ) + + device_guard = "// DeviceGuard omitted" # default + if f.device_guard and self.backend_index.device_guard: + has_tensor_options = any( + isinstance(a, TensorOptionsArguments) + for a in f.func.arguments.non_out + ) + if has_tensor_options: + # kernel is creating a tensor + device_guard = """ + const DeviceGuard device_guard(device_or_default(device));""" + + # CUDA requires special handling + if is_cuda_dispatch_key(self.backend_index.dispatch_key): + device_guard = f"globalContext().lazyInitDevice(c10::DeviceType::CUDA);\n{device_guard}" + else: + # kernel is operating on existing tensors + + # There is precedence for which argument we use to do + # device guard. This describes the precedence order. 
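+                        # Precedence: `self` first (if present), then the out arguments,
+                        # then the remaining flat positional arguments; the first
+                        # tensor-like candidate becomes the argument of device_of() for
+                        # the OptionalDeviceGuard emitted below.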
+ self_arg = ( + [f.func.arguments.self_arg.argument] + if f.func.arguments.self_arg is not None + else [] + ) + candidate_args = itertools.chain( + self_arg, + f.func.arguments.out, + f.func.arguments.flat_positional, + ) + + # Only tensor like arguments are eligible + device_of = next( + ( + f"{a.name}" + for a in candidate_args + if a.type.is_tensor_like() + ), + None, + ) + if device_of is not None: + device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" + + return f"""\ +namespace {{ + +{returns_type} {name}({args_str}) {{ + {device_check} + + {device_guard} + return {impl_name}({args_exprs_str}); +}} + +}} // anonymous namespace +""" + + elif self.target is Target.REGISTRATION: + if f.manual_kernel_registration or self.skip_dispatcher_op_registration: + return None + else: + payload = f"TORCH_FN({name})" + return f'm.impl("{f.func.name}",\n{payload});\n' + else: + assert_never(self.target) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# STRUCTURED +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +@dataclass(frozen=True) +class StructuredRegisterDispatchKey(RegisterDispatchKey): + g: NativeFunctionsGroup + + def gen_class_set_output_functions( + self, k: SchemaKind, parent_class: str, generate_super: bool + ) -> str: + if generate_super: + set_output_super = f"{parent_class}::set_output_raw_strided(output_idx, sizes, strides, options, names);" + else: + set_output_super = "" + + def gen_set_output_function(name: str, maybe_create_proxy: bool) -> str: + return f""" +void set_output_{name}( + int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, + TensorOptions options, DimnameList names +) override {{ +{textwrap.indent(self.gen_class_set_output_body(k, maybe_create_proxy), " ")} + if (!names.empty()) {{ + namedinference::propagate_names(outputs_[output_idx], names); + }} + // super must happen after, so that downstream can use maybe_get_output + // to retrieve the output +{textwrap.indent(set_output_super, " ")} +}} +""" + + return f""" +{gen_set_output_function("strided", maybe_create_proxy=True)} +{gen_set_output_function("raw_strided", maybe_create_proxy=False)} +""" + + def gen_class_set_output_body(self, k: SchemaKind, maybe_create_proxy: bool) -> str: + if self.backend_index.dispatch_key in [ + DispatchKey.CUDA, + DispatchKey.MPS, + DispatchKey.CompositeExplicitAutogradNonFunctional, + ]: + maybe_set_guard = """ +auto current_device = guard_.current_device(); +if (C10_UNLIKELY(current_device.has_value())) { + TORCH_INTERNAL_ASSERT(*current_device == options.device(), + "structured kernels don't support multi-device outputs"); +} else { + guard_.reset_device(options.device()); +} +""" + maybe_set_guard_line = maybe_set_guard + "\n" + else: + maybe_set_guard_line = maybe_set_guard = "" + + if maybe_create_proxy: + create_proxy = """ +auto maybe_proxy = maybe_create_proxy(out, sizes, strides, options); +if (C10_UNLIKELY(maybe_proxy.has_value())) { + proxy_outputs_[output_idx] = std::move(maybe_proxy).value(); +} +""" + else: + create_proxy = "" + + if k is SchemaKind.functional: + assert self.backend_index.dispatch_key in ( + DispatchKey.Meta, + DispatchKey.CPU, + DispatchKey.CUDA, + DispatchKey.MPS, + DispatchKey.XPU, + DispatchKey.CompositeExplicitAutogradNonFunctional, + ) + return f"""{maybe_set_guard_line} +outputs_[output_idx] = create_out(sizes, strides, options);""" + elif k is SchemaKind.inplace: + return f"""{maybe_set_guard_line} +const auto& out = outputs_[output_idx].get(); 
+check_inplace(out, sizes, options); +{create_proxy}""" + elif k is SchemaKind.out: + return f"""{maybe_set_guard_line} +const auto& out = outputs_[output_idx].get(); +resize_out(out, sizes, strides, options); +{create_proxy}""" + elif k is SchemaKind.mutable or k is SchemaKind.scratch: + raise AssertionError( + f"{k} structured operators are currently not supported" + ) + else: + assert_never(k) + + # returns the definition of a ctor, as well as how to construct + # this class to a variable named op + def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str: + if k is SchemaKind.functional: + return "" + elif k is SchemaKind.inplace: + # TODO: Make sure out argument is guaranteed to be self + return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}" + elif k is SchemaKind.out: + out_args = ", ".join(f"Tensor& out{i}" for i in range(returns)) + out_refs = ", ".join(f"std::ref(out{i})" for i in range(returns)) + return f"{class_name}({out_args}) : outputs_{{ {out_refs} }} {{}}" + elif k is SchemaKind.mutable or k is SchemaKind.scratch: + raise AssertionError( + f"{k} structured operators are currently not supported" + ) + else: + assert_never(k) + + def gen_class( + self, + f: NativeFunction, + k: SchemaKind, + *, + class_name: str, + parent_class: str, + generate_super: bool, + ) -> str: + if k is SchemaKind.functional: + output_type = "Tensor" + output_value = "outputs_[output_idx]" + proxy_field = "" + elif k is SchemaKind.inplace: + output_type = "std::reference_wrapper" + output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()" + proxy_field = f"std::array<::std::optional, {len(f.func.returns)}> proxy_outputs_;" + elif k is SchemaKind.out: + output_type = "std::reference_wrapper" + output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()" + proxy_field = f"std::array<::std::optional, {len(f.func.returns)}> proxy_outputs_;" + else: + raise RuntimeError(f"Unsupported SchemaKind {k}") + + if self.backend_index.dispatch_key == DispatchKey.CUDA: + if self.rocm: + guard_field = "c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;" + else: + guard_field = "c10::cuda::OptionalCUDAGuard guard_;" + elif ( + self.backend_index.dispatch_key + == DispatchKey.CompositeExplicitAutogradNonFunctional + ): + guard_field = "c10::OptionalDeviceGuard guard_;" + elif self.backend_index.dispatch_key == DispatchKey.MPS: + # TODO: Move to OptionalMPSGuard. 
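+            # Until then, MPS reuses the generic c10::OptionalDeviceGuard member,
+            # the same one used for the composite key above.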
+ guard_field = "c10::OptionalDeviceGuard guard_;" + else: + guard_field = "" + + indent = " " * 4 + class_ctor_str = self.gen_class_ctor(k, class_name, len(f.func.returns)) + lines = ( + f"struct {class_name} final : public {parent_class} {{", + f"{textwrap.indent(class_ctor_str, indent)}", + f"{textwrap.indent(self.gen_class_set_output_functions(k, parent_class, generate_super), indent)}", + " const Tensor& maybe_get_output(int64_t output_idx) override {", + f" return {output_value};\n", # type: ignore[possibly-undefined] # TODO: audit + " }", + # type: ignore[possibly-undefined] # TODO: audit + f" std::array<{output_type}, {len(f.func.returns)}> outputs_;", + f"{textwrap.indent(proxy_field, indent)}", # type: ignore[possibly-undefined] # TODO: audit + f"{textwrap.indent(guard_field, indent)}", + "};", + ) + return "\n".join(line for line in lines if line) + + @method_with_native_function + def gen_one(self, f: NativeFunction) -> str | None: + assert not f.manual_kernel_registration + + if ( + self.target is Target.REGISTRATION + and not self.selector.is_native_function_selected(f) + ): + return None + + # TODO: Now, there is something interesting going on here. In the code below, + # we generate CompositeExplicitAutogradNonFunctional implementations of functional and inplace + # based on the out implementation. But in fact, out is definable by + # functional too (just not very efficiently), and this is honestly the + # MORE likely situation for a backend implementor. How do we pick? + # Well, taking a page from Haskell type classes and default methods, + # we could conceivably register a circular definition (out in terms + # of functional, and functional in terms of out) and just require + # someone to implement one or the other. We'd have to do a little bit + # of work to not register one of these "weak" definitions unless there + # is a strong definition somewhere in the DAG! So it's not implemented yet. + if ( + self.backend_index.dispatch_key + == DispatchKey.CompositeExplicitAutogradNonFunctional + and f.func.kind() is SchemaKind.out + ): + # Never generate a default implementation for out, that's what you + # have to define as a backend implementor + return None + + # Note [Direct dispatch bindings] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Signature of the non-dispatched function we'll expose in a header + # (e.g., at::cpu::add). We don't generate methods (TODO: do this + # when CPUTensor class is a thing); nor do we generate fallback + # bindings for manual_cpp_binding functions. 
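+        # For the namespaced targets below this produces a declaration like
+        # `TORCH_API Tensor add(const Tensor & self, const Tensor & other, const Scalar & alpha);`
+        # plus a definition that simply forwards to the dispatch-key wrapper, roughly
+        # `return wrapper_CPU_add_Tensor(self, other, alpha);` (names illustrative).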
+ cpp_sig_group = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=False + ) + + # Signature of the wrapper function we'll register to the dispatcher + kern = self.backend_index.get_kernel(f) + sig = NativeSignature( + f.func, + prefix=f"wrapper_{self.backend_index.dispatch_key}_", + symint=kern is not None and kern.supports_symint(), + ) + + if self.target is Target.NAMESPACED_DECLARATION: + result = "" + for cpp_sig in cpp_sig_group.signatures(symint=self.symint): + result += f"TORCH_API {cpp_sig.decl()};\n" + return result + + elif self.target is Target.NAMESPACED_DEFINITION: + + def generate_defn(cpp_sig: CppSignature) -> str: + return f""" +{cpp_sig.defn()} {{ +return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); +}} +""" + + result = "" + for cpp_sig in cpp_sig_group.signatures(symint=self.symint): + result += generate_defn(cpp_sig) + return result + + elif self.target is Target.ANONYMOUS_DEFINITION: + k = f.func.kind() + + # Construct the body of the wrapper function with signature sig + sig_body = [] + # We'll use context to keep track of any variables we've brought + # into scope while generating code + context: list[Binding | Expr] = list(sig.arguments()) + + # Initialize the class corresponding to this structured + # operator; feeding it the output argument(s) if it is known + if self.backend_index.dispatch_key is DispatchKey.Meta: + class_name = f"structured_{meta.name(self.g)}_meta_{k.name}" + parent_class = f"at::meta::structured_{meta.name(self.g)}" + elif ( + self.backend_index.dispatch_key + is DispatchKey.CompositeExplicitAutogradNonFunctional + ): + # TODO: dedup this branch + class_name = f"structured_{meta.name(self.g)}_default_backend_{k.name}" + parent_class = f"at::meta::structured_{meta.name(self.g)}" + else: + metadata = self.backend_index.get_kernel(self.g) + assert metadata is not None + class_name = f"structured_{metadata.kernel}_{k.name}" + parent_class = f"{metadata.cpp_namespace}::structured_{metadata.kernel}" + + if self.backend_index.device_guard: + device_check_args = itertools.chain( + f.func.arguments.out, f.func.arguments.flat_positional + ) + sig_body.append( + RegisterDispatchKey.gen_device_check( + f.device_check, list(device_check_args), sig.name() + ) + ) + + if k is SchemaKind.functional: + sig_body.append(f"{class_name} op;") + elif k is SchemaKind.inplace: + sig_body.append(f"{class_name} op(self);") + elif k is SchemaKind.out: + out_args_str = ", ".join(a.name for a in f.func.arguments.out) + sig_body.append(f"{class_name} op({out_args_str});") + + # Translate the input native arguments into structured + # arguments for the meta call + meta_exprs = ", ".join( + e.expr + for e in translate( + context, structured.meta_arguments(self.g), method=False + ) + ) + + if self.g.out.precomputed: + # If this function group has precomputed elements, the meta function + # returns a struct containing them which must be saved so that it + # can be unpacked when generating code to call the impl. + sig_body.append(f"auto precompute = op.meta({meta_exprs});") + + # Put all of the contents of the precompute struct into the context + # so that translate will be able to return the correct args for the + # call to the impl. 
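+                # Both the replaced arguments and any extra elements added by the
+                # operator's `precomputed:` entry in native_functions.yaml are exposed
+                # to translate() as `precompute.<name>` expressions.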
+ precomputed_values = [ + *self.g.out.precomputed.replace.values(), + self.g.out.precomputed.add, + ] + for precomputed_elems in precomputed_values: + for arg in precomputed_elems: + context.append( + Expr( + expr=f"precompute.{arg.name}", + type=structured.argument_type(arg, binds=arg.name), + ) + ) + + # Add a use of the precompute struct so FB internal compilers don't + # complain that there is an unused variable. + sig_body.append("(void)precompute;") + else: + sig_body.append(f"op.meta({meta_exprs});") + + # After running meta, op.outputs_ is guaranteed to be valid; + # add it to the context + out_args = structured.out_arguments(self.g) + for i, out_arg in enumerate(out_args): + assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type + + if k is SchemaKind.out: + expr = f"op.maybe_get_output({i})" + else: + expr = f"op.outputs_[{i}]" + + context.append( + Expr( + expr=expr, + # TODO: Stop hardcoding that the output type is a Tensor. Note + # that for the codegen here this is fine because outputs_ is + # hardcoded to be tensor already + type=NamedCType( + out_arg.nctype.name, MutRefCType(BaseCType(tensorT)) + ), + ) + ) + + # With the expanded context, do the impl call (if not a meta + # function) + if ( + self.backend_index.dispatch_key + == DispatchKey.CompositeExplicitAutogradNonFunctional + ): + # TODO: https://github.com/pytorch/pytorch/issues/53023 + out_sig_group = CppSignatureGroup.from_native_function( + self.g.out, method=False, fallback_binding=f.manual_cpp_binding + ) + out_sig = out_sig_group.most_faithful_signature() + api_name = out_sig.name() + out_exprs = ", ".join( + e.expr + for e in translate(context, out_sig.arguments(), method=False) + ) + # TODO: I think this means structured won't work with method + # only functions (but maybe you're saved by faithful? iunno.) + # NB: Originally I wrote this as an at::redispatch call, but + # I got in trouble because that meant I needed a DispatchKeySet + # in the wrapper function, which meant I needed a DispatchKeySet + # in the DispatchKeyFunctions declarations, but the defined API + # there does NOT permit a dispatch key set. I think you can + # probably unwind this by calling some function to do the TLS + # fetch and get the DispatchKeySet when you don't have it, but + # I didn't do it for this version + sig_body.append(f"at::{api_name}({out_exprs});") + elif self.backend_index.dispatch_key != DispatchKey.Meta: + impl_exprs = ", ".join( + e.expr + for e in translate( + context, structured.impl_arguments(self.g), method=False + ) + ) + sig_body.append(f"op.impl({impl_exprs});") + + # Go over each output, and check if there is a proxy created for it. + # If so, copy it over to the original output. 
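+            # A proxy output only exists when set_output_strided() found that the
+            # user-supplied out tensor's strides do not match the requested ones
+            # (see maybe_create_proxy); copying it back preserves the out tensor's
+            # identity while still giving it the correct values.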
+ if k is SchemaKind.out or k is SchemaKind.inplace: + for i in range(len(f.func.returns)): + sig_body.append( + f"if (op.proxy_outputs_[{i}].has_value()) op.outputs_[{i}].get().copy_(*op.proxy_outputs_[{i}]);" + ) + + # Destructively return the final tensors + # TODO: Do this in translate instead + if k is SchemaKind.functional: + if len(f.func.returns) == 1: + ret_expr = "std::move(op.outputs_[0])" # small optimization + else: + moved = ", ".join( + f"std::move(op.outputs_[{i}])" + for i in range(len(f.func.returns)) + ) + ret_expr = f"std::make_tuple({moved})" + elif k is SchemaKind.inplace: + ret_expr = "self" + elif k is SchemaKind.out: + if len(f.func.returns) == 1: + ret_expr = f.func.arguments.out[0].name + else: + refs = ", ".join(a.name for a in f.func.arguments.out) + ret_expr = f"std::forward_as_tuple({refs})" + sig_body.append(f"return {ret_expr};") # type: ignore[possibly-undefined] # TODO: audit + + sig_body_str = "\n".join(sig_body) + + # For an overview of what this template code looks like, see + # https://github.com/pytorch/rfcs/pull/9 + return f"""\ +{self.gen_class( +f, k, +class_name=class_name, +parent_class=parent_class, +generate_super=self.g.out.structured_inherits is not None +)} + +{sig.defn()} {{ +{sig_body_str} +}} +""" + + elif self.target is Target.REGISTRATION: + return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' + else: + assert_never(self.target) + # Silence mypy's "Missing return statement" error + return None diff --git a/torchgen/dest/ufunc.py b/torchgen/dest/ufunc.py new file mode 100644 index 00000000000..073df2eb184 --- /dev/null +++ b/torchgen/dest/ufunc.py @@ -0,0 +1,551 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Sequence, TYPE_CHECKING + +import torchgen.api.ufunc as ufunc +from torchgen.api.translate import translate +from torchgen.api.types import ( + BaseCType, + Binding, + CType, + Expr, + NamedCType, + opmath_t, + scalar_t, + StructuredImplSignature, + VectorizedCType, +) +from torchgen.context import with_native_function +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + DispatchKey, + NativeFunctionsGroup, + ScalarType, + UfuncKey, +) +from torchgen.utils import OrderedSet + + +if TYPE_CHECKING: + from torchgen.api.ufunc import UfunctorBindings + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# CUDA STUFF +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# NB: not bothering to generate dispatch stub forward declaration in header, +# we can just paste it whereever necessary + +# TODO: use BackendIndex +# dispatch_key: DispatchKey # only CPU/CUDA right now + + +# Represents functors for implementing CUDA ufuncs. +# Functors are templated by scalar_t because when USERS instantiate functors +# they are templated. 
A functor looks something like this: +# +# template +# struct CUDAFunctorOnSelf_add { +# using opmath_t = at::opmath_type; +# opmath_t other_; +# opmath_t alpha_; +# CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha) +# : other_(other), alpha_(alpha) {} +# __device__ scalar_t operator()(scalar_t self) { +# return ufunc::add(static_cast(self), other_, alpha_); +# } +# }; +# +@dataclass(frozen=True) +class UfunctorSignature: + g: NativeFunctionsGroup + scalar_tensor_idx: int | None + name: str + + def arguments(self) -> UfunctorBindings: + return ufunc.ufunctor_arguments( + self.g, scalar_tensor_idx=self.scalar_tensor_idx, scalar_t=scalar_t + ) + + def fields(self) -> list[Binding]: + # fields are renamed to have a trailing underscore, as is conventional + return [b.rename(f"{b.name}_") for b in self.arguments().ctor] + + def returns_type(self) -> CType: + # TODO: don't hardcode; return type will be inferred based on tags on + # the native function + return BaseCType(scalar_t) + + def decl_fields(self) -> str: + return "\n".join(f"{f.type} {f.name};" for f in self.fields()) + + def inline_defn_ctor(self) -> str: + args_str = ", ".join(a.decl() for a in self.arguments().ctor) + # NB: hypothetically could do this with translate but the + # transition here is very regular + init_str = ", ".join(f"{a.name}_({a.name})" for a in self.arguments().ctor) + return f"{self.name}({args_str}) : {init_str} {{}}" + + def decl_apply(self) -> str: + args_str = ", ".join(a.decl() for a in self.arguments().apply) + return f"{self.returns_type().cpp_type()} operator()({args_str}) const" + + +@dataclass(frozen=True) +class UfuncSignature: + g: NativeFunctionsGroup + name: str + compute_t: CType + + def arguments(self) -> list[Binding]: + return ufunc.ufunc_arguments(self.g, compute_t=self.compute_t) + + def call(self, ctx: Sequence[Binding | Expr]) -> str: + return f"{self.name}({', '.join(a.expr for a in translate(ctx, self.arguments()))})" + + +# steps: +# 1. take the functional signature +# 2. use api.ufunc to convert it to template signature. this establishes +# the type of the template function +# 3. use api.ufunc (II) to generate a split struct / operator() signature. +# this establish context in which we call the template signature +# +# StructuredImplSignature context +# ~> functor constructor sig +# +# Functor constructor context +# ~> functor fields sig +# +# Functor apply context (functor fields + functor apply sig) +# ~> template sig +# + + +def eligible_for_binary_scalar_specialization(g: NativeFunctionsGroup) -> bool: + num_tensors = sum( + 1 for a in g.functional.func.arguments.flat_non_out if a.type.is_tensor_like() + ) + return num_tensors == 2 + + +def compute_ufunc_cuda_functors( + g: NativeFunctionsGroup, +) -> tuple[dict[ScalarType, dict[UfuncKey, UfunctorSignature]], str]: + # First, build the functors. 
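+    # The result maps every supported ScalarType to the UfunctorSignature chosen for
+    # each applicable UfuncKey (CUDAFunctorOnSelf / CUDAFunctorOnOther / CUDAFunctor);
+    # the second element of the returned tuple is the C++ source of any functor structs
+    # synthesized here from the ScalarOnly / Generic inner loops.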
+ ufunctor_sigs: dict[ScalarType, dict[UfuncKey, UfunctorSignature]] = {} + ufunctors: list[str] = [] + loops = g.out.ufunc_inner_loop + scalar_tensor_idx_lookup = { + UfuncKey.CUDAFunctorOnSelf: 1, + UfuncKey.CUDAFunctorOnOther: 0, + UfuncKey.CUDAFunctor: None, + } + if eligible_for_binary_scalar_specialization(g): + keys = [ + UfuncKey.CUDAFunctorOnSelf, + UfuncKey.CUDAFunctorOnOther, + UfuncKey.CUDAFunctor, + ] + else: + keys = [UfuncKey.CUDAFunctor] + for k in [UfuncKey.CUDAFunctorOnSelf, UfuncKey.CUDAFunctorOnOther]: + assert k not in loops, f"cannot use {k} on non-binary function" + for k in keys: + # If the key was directly defined, skip functor codegen; we assume the + # user already done it for us + if k in loops: + ufunctor_sig = UfunctorSignature( + g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=loops[k].name + ) + for dtype in loops[k].supported_dtypes: + ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig + continue + + # Note [ScalarOnly and Generic must match names for CUDA] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Otherwise, look in ANY of the generic entries. For simplicity of + # codegen, both ScalarOnly and Generic are defined, the ufunc name + # must match (if they didn't match, we'd have to generate distinct + # functors per dtype, which is awful, so we're not going to do it unless + # someone really forces us to) + ufunc_name = None + supported_dtypes: OrderedSet[ScalarType] = OrderedSet() + for lk in [UfuncKey.ScalarOnly, UfuncKey.Generic]: + if lk not in loops: + continue + if ufunc_name is None: + ufunc_name = loops[lk].name + else: + # See Note [ScalarOnly and Generic must match names for CUDA] + assert ( + ufunc_name == loops[lk].name + ), "ScalarOnly and Generic must have same ufunc name" + supported_dtypes |= loops[lk].supported_dtypes + assert ufunc_name is not None + + name = f"{k}_{ufunc_name}" + ufunctor_sig = UfunctorSignature( + g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=name + ) + for dtype in supported_dtypes: + ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig + + ufunc_sig = UfuncSignature( + g, name=f"ufunc::{ufunc_name}", compute_t=BaseCType(opmath_t) + ) + apply_ctx = ufunctor_sig.fields() + ufunctor_sig.arguments().apply + ufunctors.append( + f""" +template +struct {ufunctor_sig.name} {{ + using opmath_t = at::opmath_type; + {ufunctor_sig.decl_fields()} + {ufunctor_sig.inline_defn_ctor()} + __device__ {ufunctor_sig.decl_apply()} {{ + return {ufunc_sig.call(apply_ctx)}; + }} +}}; +""" + ) + + return ufunctor_sigs, "\n".join(ufunctors) + + +@dataclass(frozen=True) +class BinaryScalarSpecializationConfig: + scalar_idx: int + ctor_tensor: str + ufunc_key: UfuncKey + + +BinaryScalarSpecializationConfigs = [ + BinaryScalarSpecializationConfig( + scalar_idx=0, + ctor_tensor="self", + ufunc_key=UfuncKey.CUDAFunctorOnOther, + ), + BinaryScalarSpecializationConfig( + scalar_idx=1, + ctor_tensor="other", + ufunc_key=UfuncKey.CUDAFunctorOnSelf, + ), +] + + +def compute_ufunc_cuda_dtype_body( + g: NativeFunctionsGroup, + dtype: ScalarType, + inner_loops: dict[UfuncKey, UfunctorSignature], + parent_ctx: Sequence[Binding], +) -> str: + body = "using opmath_t = at::opmath_type;" + body += "if (false) {}\n" # for ease of codegen + for config in BinaryScalarSpecializationConfigs: + if config.ufunc_key not in inner_loops: + continue + ufunctor_sig = inner_loops[config.ufunc_key] + scalar_idx = config.scalar_idx + 1 + # Make a copy and at the same time widen the type (not permissible + # without copy; we don't want to 
mutate the input argument anyway) + ctx: list[Expr | Binding] = list(parent_ctx) + ctx.append( + Expr( + expr=f"iter.scalar_value({scalar_idx})", + type=NamedCType(config.ctor_tensor, BaseCType(opmath_t)), + ) + ) + ufunctor_ctor_exprs_str = ", ".join( + a.expr for a in translate(ctx, ufunctor_sig.arguments().ctor) + ) + + # NB: ufunctor must be allocated before iter.remove_operand is called, + # as it relies on iter + body += f"""\ +else if (iter.is_cpu_scalar({scalar_idx})) {{ + {ufunctor_sig.name} ufunctor({ufunctor_ctor_exprs_str}); + iter.remove_operand({scalar_idx}); + gpu_kernel(iter, ufunctor); +}}""" + + ufunctor_sig = inner_loops[UfuncKey.CUDAFunctor] + ufunctor_ctor_exprs_str = ", ".join( + a.expr for a in translate(parent_ctx, ufunctor_sig.arguments().ctor) + ) + body += f""" +else {{ + gpu_kernel(iter, {ufunctor_sig.name}({ufunctor_ctor_exprs_str})); +}} + """ + return body + + +@with_native_function +def compute_ufunc_cuda(g: NativeFunctionsGroup) -> str: + # First, build the functors, indexing them by dtype + ufunctor_sigs, ufunctors = compute_ufunc_cuda_functors(g) + + # Next, build the conditionals + sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CUDA)) + dtype_cases = [] + for dtype, inner_ufunc_sigs in ufunctor_sigs.items(): + dtype_cases.append( + f""" +AT_DISPATCH_CASE(at::ScalarType::{dtype}, + [&]() {{ + {compute_ufunc_cuda_dtype_body(g, dtype, inner_ufunc_sigs, sig.arguments())} + }} +) +""" + ) + + dtype_cases_str = "\n".join(dtype_cases) + + stub_sig = StubSignature(g) + + return f""" +{ufunctors} + +{stub_sig.type_defn()}; +{stub_sig.dispatch_decl()}; + +{stub_sig.kernel_defn()} {{ + AT_DISPATCH_SWITCH(iter.common_dtype(), "{sig.name}", + {dtype_cases_str} + ); +}} +REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name}); + +{sig.defn()} {{ + {stub_sig.direct_call(sig.arguments())}; +}} +""" + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# CPU STUFF +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +@dataclass(frozen=True) +class StubSignature: + g: NativeFunctionsGroup + + @property + def name(self) -> str: + return f"{str(self.g.functional.func.name.name)}_stub" + + @property + def kernel_name(self) -> str: + return f"{str(self.g.functional.func.name.name)}_kernel" + + @property + def type_name(self) -> str: + return f"{str(self.g.functional.func.name.name)}_fn" + + def arguments(self) -> list[Binding]: + return ufunc.stub_arguments(self.g) + + def type(self) -> str: + cpp_args = self.arguments() + return f"void(*)(TensorIteratorBase&, {', '.join(a.type for a in cpp_args)})" + + def dispatch_decl(self) -> str: + return f"DECLARE_DISPATCH({self.type_name}, {self.name})" + + def dispatch_defn(self) -> str: + return f"DEFINE_DISPATCH({self.name})" + + def kernel_defn(self) -> str: + return f"void {self.kernel_name}(TensorIteratorBase& iter, {', '.join(a.defn() for a in self.arguments())})" + + def type_defn(self) -> str: + return f"using {self.type_name} = {self.type()}" + + # must be called from context where this is TensorIteratorBase* + def call(self, ctx: Sequence[Binding]) -> str: + return f"{self.name}(device_type(), *this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})" + + # used in CUDA to skip the unnecessary dynamic dispatch + def direct_call(self, ctx: Sequence[Binding]) -> str: + return f"{self.kernel_name}(*this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})" + + +@with_native_function +def compute_ufunc_cpu(g: 
NativeFunctionsGroup) -> str: + stub_sig = StubSignature(g) + sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CPU)) + + return f""" +{stub_sig.type_defn()}; +{stub_sig.dispatch_decl()}; +{stub_sig.dispatch_defn()}; + +{sig.defn()} {{ + {stub_sig.call(sig.arguments())}; +}} +""" + + +def compute_ufunc_cpu_dtype_body( + g: NativeFunctionsGroup, + dtype: ScalarType, + inner_loops: dict[UfuncKey, UfuncSignature], + parent_ctx: Sequence[Binding], +) -> str: + assert UfuncKey.CPUScalar in inner_loops, f"{dtype}, {inner_loops.keys()}" + assert inner_loops.keys() <= {UfuncKey.CPUScalar, UfuncKey.CPUVector} + scalar_loop = inner_loops[UfuncKey.CPUScalar] + vec_loop = None + if UfuncKey.CPUVector in inner_loops: + vec_loop = inner_loops[UfuncKey.CPUVector] + + # NB: We DON'T use translate here, because translate is + # incapable of CSE'ing the scalar accesses in case it is also + # used by Vectorized; also, the unpacking here is very simple + # and only affects Scalar; everything else is implicitly captured + # by the lambda + + # Setup scalar in scope + body = [] + ctx = [] + for b in parent_ctx: + if isinstance(b.argument, Argument) and b.argument.type != BaseType( + BaseTy.Scalar + ): + continue + body.append(f"auto _s_{b.name} = {b.name}.to();") + ctx.append(Expr(f"_s_{b.name}", NamedCType(b.nctype.name, BaseCType(scalar_t)))) + if vec_loop is not None: + for b in parent_ctx: + if isinstance(b.argument, Argument) and b.argument.type != BaseType( + BaseTy.Scalar + ): + continue + body.append( + f"auto _v_{b.name} = at::vec::Vectorized(_s_{b.name});" + ) + ctx.append( + Expr( + f"_v_{b.name}", + NamedCType(b.nctype.name, VectorizedCType(BaseCType(scalar_t))), + ) + ) + + # Setup lambda signature + # NB: simplified version of ufunctor_arguments + scalar_bindings = [] + vec_bindings = [] + for a in g.functional.func.arguments.flat_non_out: + if not a.type.is_tensor_like(): + continue + assert a.type == BaseType(BaseTy.Tensor) + scalar_bindings.append( + Binding( + name=a.name, + nctype=NamedCType(a.name, BaseCType(scalar_t)), + argument=a, + ) + ) + if vec_loop is not None: + vec_bindings.append( + Binding( + name=a.name, + nctype=NamedCType(a.name, VectorizedCType(BaseCType(scalar_t))), + argument=a, + ) + ) + + def with_ctx(b: Sequence[Binding]) -> list[Expr | Binding]: + r: list[Expr | Binding] = [] + r.extend(ctx) + r.extend(b) + return r + + body_str = "\n".join(body) + if vec_loop is not None: + return f""" +{body_str} +cpu_kernel_vec(iter, + [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }}, + [=]({', '.join(b.decl() for b in vec_bindings)}) {{ return {vec_loop.call(with_ctx(vec_bindings))}; }} +); +""" + else: + return f""" +{body_str} +cpu_kernel(iter, + [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }} +); +""" + + +@with_native_function +def compute_ufunc_cpu_kernel(g: NativeFunctionsGroup) -> str: + stub_sig = StubSignature(g) + + # Reindex the ufunc by dtypes; processing generic/scalaronly as well + loops = g.out.ufunc_inner_loop + ufunc_sigs: dict[ScalarType, dict[UfuncKey, UfuncSignature]] = {} + for k in [UfuncKey.CPUScalar, UfuncKey.CPUVector]: + lks = [] + # ORDER MATTERS: this specifies overriding precedence + if k in loops: # should happen rarely + lks.append(k) + if UfuncKey.ScalarOnly in loops and k is UfuncKey.CPUScalar: + lks.append(UfuncKey.ScalarOnly) + if UfuncKey.Generic in loops: + lks.append(UfuncKey.Generic) + # TODO: don't 
hardcode ufunc:: namespace here, should be centralized smh + for lk in lks: + for dtype in loops[lk].supported_dtypes: + compute_t: CType + if k is UfuncKey.CPUScalar: + compute_t = BaseCType(scalar_t) + elif k is UfuncKey.CPUVector: + compute_t = VectorizedCType(BaseCType(scalar_t)) + else: + raise AssertionError + inner_ufunc_sigs = ufunc_sigs.setdefault(dtype, {}) + if k not in inner_ufunc_sigs: + inner_ufunc_sigs[k] = UfuncSignature( + g, name=f"ufunc::{loops[lk].name}", compute_t=compute_t + ) + + # Build the conditionals + dtype_cases = [] + for dtype, inner_ufunc_sigs in ufunc_sigs.items(): + dtype_cases.append( + f""" +AT_DISPATCH_CASE(at::ScalarType::{dtype}, + [&]() {{ + {compute_ufunc_cpu_dtype_body(g, dtype, inner_ufunc_sigs, stub_sig.arguments())} + }} +) +""" + ) + + dtype_cases_str = "\n".join(dtype_cases) + return f""" +namespace {{ + +{stub_sig.kernel_defn()} {{ + AT_DISPATCH_SWITCH(iter.common_dtype(), "{stub_sig.name}", + {dtype_cases_str} + ); +}} + +}} // anonymous namespace + +{stub_sig.type_defn()}; +{stub_sig.dispatch_decl()}; +REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name}); +""" diff --git a/torchgen/executorch/__init__.py b/torchgen/executorch/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/executorch/api/__init__.py b/torchgen/executorch/api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/executorch/api/custom_ops.py b/torchgen/executorch/api/custom_ops.py new file mode 100644 index 00000000000..bbe62c72f68 --- /dev/null +++ b/torchgen/executorch/api/custom_ops.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass +from typing import Sequence, TYPE_CHECKING + +from torchgen import dest + + +# disable import sorting to avoid circular dependency. +from torchgen.api.types import DispatcherSignature # usort: skip +from torchgen.context import method_with_native_function +from torchgen.model import BaseTy, BaseType, DispatchKey, NativeFunction, Variant +from torchgen.utils import concatMap, Target + + +if TYPE_CHECKING: + from torchgen.executorch.model import ETKernelIndex + from torchgen.selective_build.selector import SelectiveBuilder + + +# Generates RegisterKernelStub.cpp, which provides placeholder kernels for custom operators. This will be used at +# model authoring side. 
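+# As an illustration, for a hypothetical custom op
+# `custom::my_op(Tensor self) -> Tensor` the generated placeholder would look
+# roughly like this (the exact signature comes from DispatcherSignature):
+#
+#   at::Tensor wrapper_CPU__my_op(const at::Tensor & self) {
+#     return at::Tensor();
+#   }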
+@dataclass(frozen=True) +class ComputeNativeFunctionStub: + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + if Variant.function not in f.variants: + return None + + sig = DispatcherSignature.from_schema( + f.func, prefix=f"wrapper_CPU_{f.func.name.overload_name}_", symint=False + ) + assert sig is not None + if len(f.func.returns) == 0: + ret_name = "" + elif len(f.func.returns) == 1: + if f.func.arguments.out: + ret_name = f.func.arguments.out[0].name + else: + ret_name = next( + ( + a.name + for a in f.func.arguments.flat_non_out + if a.type == f.func.returns[0].type + ), + "", + ) + if not ret_name: + # if return type is tensor + if f.func.returns[0].type == BaseType(BaseTy.Tensor): + # Returns an empty tensor + ret_name = "at::Tensor()" + else: + raise Exception( # noqa: TRY002 + f"Can't handle this return type {f.func}" + ) # noqa: TRY002 + elif len(f.func.arguments.out) == len(f.func.returns): + # Returns a tuple of out arguments + tensor_type = "at::Tensor &" + comma = ", " + ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>( + {comma.join([r.name for r in f.func.arguments.out])} + )""" + else: + assert all( + a.type == BaseType(BaseTy.Tensor) for a in f.func.returns + ), f"Only support tensor returns but got {f.func.returns}" + # Returns a tuple of empty tensors + tensor_type = "at::Tensor" + comma = ", " + ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>( + {comma.join(["at::Tensor()" for _ in f.func.returns])} + )""" + ret_str = f"return {ret_name};" if len(f.func.returns) > 0 else "" + return f""" +{sig.defn()} {{ + {ret_str} +}} + """ + + +def gen_custom_ops_registration( + *, + native_functions: Sequence[NativeFunction], + selector: SelectiveBuilder, + kernel_index: ETKernelIndex, + rocm: bool, +) -> tuple[str, str]: + """ + Generate custom ops registration code for dest.RegisterDispatchKey. + + :param native_functions: a sequence of `NativeFunction` + :param selector: for selective build. + :param kernel_index: kernels for all the ops. + :param rocm: bool for dest.RegisterDispatchKey. + :return: generated C++ code to register custom operators into PyTorch + """ + + # convert kernel index to BackendIndex. This is because we can't handle ETKernelIndex yet. + # TODO larryliu: evaluate if this code is still needed. If yes let it handle ETKernelIndex. 
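+    # The second element of the returned pair is a sequence of
+    # TORCH_LIBRARY_IMPL blocks, one per namespace, along the lines of
+    # (illustrative, using a hypothetical `custom` namespace):
+    #
+    #   TORCH_LIBRARY_IMPL(custom, CPU, m) {
+    #     m.impl("my_op", TORCH_FN(wrapper_CPU__my_op));
+    #   };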
+ + dispatch_key = DispatchKey.CPU + backend_index = kernel_index._to_backend_index() + static_init_dispatch_registrations = "" + ns_grouped_native_functions: dict[str, list[NativeFunction]] = defaultdict(list) + for native_function in native_functions: + ns_grouped_native_functions[native_function.namespace].append(native_function) + + for namespace, functions in ns_grouped_native_functions.items(): + if len(functions) == 0: + continue + dispatch_registrations_body = "\n".join( + list( + concatMap( + dest.RegisterDispatchKey( + backend_index, + Target.REGISTRATION, + selector, + rocm=rocm, + symint=False, + class_method_name=None, + skip_dispatcher_op_registration=False, + ), + functions, + ) + ) + ) + static_init_dispatch_registrations += f""" +TORCH_LIBRARY_IMPL({namespace}, {dispatch_key}, m) {{ +{dispatch_registrations_body} +}};""" + anonymous_definition = "\n".join( + list( + concatMap( + dest.RegisterDispatchKey( + backend_index, + Target.ANONYMOUS_DEFINITION, + selector, + rocm=rocm, + symint=False, + class_method_name=None, + skip_dispatcher_op_registration=False, + ), + native_functions, + ) + ) + ) + return anonymous_definition, static_init_dispatch_registrations diff --git a/torchgen/executorch/api/et_cpp.py b/torchgen/executorch/api/et_cpp.py new file mode 100644 index 00000000000..76cebcd0f0f --- /dev/null +++ b/torchgen/executorch/api/et_cpp.py @@ -0,0 +1,370 @@ +from __future__ import annotations + +from typing import Sequence + +from torchgen import local +from torchgen.api.types import ( + ArgName, + BaseCType, + Binding, + ConstRefCType, + CType, + MutRefCType, + NamedCType, + SpecialArgName, + TupleCType, + VectorCType, + voidT, +) +from torchgen.executorch.api.types import ( + ArrayRefCType, + BaseTypeToCppMapping, + OptionalCType, + scalarT, + tensorListT, + tensorT, +) +from torchgen.model import ( + Argument, + Arguments, + BaseTy, + BaseType, + ListType, + NativeFunction, + OptionalType, + Return, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.utils import assert_never + + +""" +This file describes the translation of JIT schema to the public C++ API, which is what people use when they call +functions like at::add. It also serves as a native function API, which is the signature of kernels, +since in Executorch CppSignature is the same as NativeSignature. + +Difference between this file and torchgen.api.cpp.py: + + - Executorch doesn't support TensorOptions, however in this file we still keep the logic here to be compatible with + torchgen.api.cpp, so that we can do stuff like ATen mode (running ATen kernels in Executorch). + + - Executorch doesn't support Dimname. + + - Executorch runtime doesn't support SymInt, will treat it as int. +""" + + +# Translation of "value types" in JIT schema to C++ API type. Value +# types look the same no matter if they are argument types or return +# types. Returns None if the type in question is not a value type. +def valuetype_type( + t: Type, + *, + binds: ArgName, + remove_non_owning_ref_types: bool = False, +) -> NamedCType | None: + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar: + return None + # For SymInt we simply treat it as int. + elif str(t) == "SymInt": + return NamedCType(binds, BaseCType(BaseTypeToCppMapping[BaseTy.int])) + if remove_non_owning_ref_types: + if t.name == BaseTy.str: + raise AssertionError( + "string ref->value conversion: not implemented yet" + ) + # All other BaseType currently map directly to BaseCppTypes. 
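+        # e.g. BaseTy.int -> int64_t, BaseTy.float -> double, BaseTy.bool -> bool,
+        # BaseTy.str -> torch::executor::string_view (see BaseTypeToCppMapping).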
+ return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name])) + elif isinstance(t, OptionalType): + elem = valuetype_type(t.elem, binds=binds) + if elem is None: + return None + return NamedCType(binds, OptionalCType(elem.type)) + elif isinstance(t, ListType): + if str(t.elem) == "bool": + assert t.size is not None + return NamedCType( + binds, ArrayRefCType(BaseCType(BaseTypeToCppMapping[BaseTy.bool])) + ) + else: + return None + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# Translation of types occurring in JIT arguments to a C++ argument type. +# If remove_non_owning_ref_types is set, we'll guarantee that the outputed CType is not a non-owning reference type. +# For example, we'll return std::vector instead of IntArrayRef. +# See Note [translation from C++ reference to value types] +def argumenttype_type( + t: Type, + *, + mutable: bool, + binds: ArgName, + remove_non_owning_ref_types: bool = False, +) -> NamedCType: + # If it's a value type, do the value type translation + r = valuetype_type( + t, + binds=binds, + remove_non_owning_ref_types=remove_non_owning_ref_types, + ) + if r is not None: + return r + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable and not local.use_const_ref_for_mutable_tensors(): + return NamedCType(binds, MutRefCType(BaseCType(tensorT))) + else: + return NamedCType(binds, ConstRefCType(BaseCType(tensorT))) + elif t.name == BaseTy.Scalar: + return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) + else: + raise AssertionError(f"base type should have been value type {t}") + elif isinstance(t, OptionalType): + if str(t.elem) == "Tensor": + if mutable and not local.use_const_ref_for_mutable_tensors(): + return NamedCType( + binds, MutRefCType(BaseCType(tensorT)) + ) # TODO: fix this discrepancy + else: + return NamedCType( + binds, ConstRefCType(OptionalCType(BaseCType(tensorT))) + ) + elif str(t.elem) == "Scalar": + return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) + return NamedCType(binds, OptionalCType(elem.type)) + elif isinstance(t, ListType): + # TODO: keeping these special cases for Tensor[] and Tensor?[] so that we can hookup with ATen kernels. + if str(t.elem) == "Tensor": + return NamedCType(binds, BaseCType(tensorListT)) + elif str(t.elem) == "Dimname": + raise NotImplementedError("Executorch doesn't support Dimname") + elif str(t.elem) == "Tensor?": + return NamedCType(binds, ArrayRefCType(OptionalCType(BaseCType(tensorT)))) + elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) + return NamedCType(binds, ArrayRefCType(elem.type)) + else: + raise AssertionError(f"unrecognized type {repr(t)}") + + +# Translate a JIT argument into its C++ type +def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: + return argumenttype_type(a.type, mutable=a.is_write, binds=binds) + + +# Translation of a (non-multi) return type from JIT to C++ +# N.B: returntype_type returns a CType, not a NamedCType. +# This is mostly because of the mismatch between return types and return names. +# e.g. a function with a return type of 'void' has 0 return names, +# and a function with a return type of 'std::tuple' has >1 return name. 
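+# For instance, a `Tensor` return maps to a by-value torch::executor::Tensor,
+# while a mutable `Tensor(a!)` return maps to `Tensor &` (subject to the
+# use_const_ref_for_mutable_tensors escape hatch below).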
+def returntype_type(t: Type, *, mutable: bool) -> CType: + # placeholder is ignored + r = valuetype_type(t, binds="__placeholder__") + if r is not None: + return r.type + + if isinstance(t, BaseType): + if t.name == BaseTy.Tensor: + if mutable: + if local.use_const_ref_for_mutable_tensors(): + return ConstRefCType(BaseCType(tensorT)) + else: + return MutRefCType(BaseCType(tensorT)) + else: + # Note [Tensor Copy Returns] + # Currently, we use "Argument.is_write" to determine + # whether or not Tensor return types should be copies or references. + # If that ever changes, take a look at other locations of this note! + return BaseCType(tensorT) + elif t.name == BaseTy.Scalar: + return BaseCType(scalarT) + elif isinstance(t, ListType): + assert ( + not mutable + ), "Native functions should never return a mutable tensor list. They should return void." + elem = returntype_type(t.elem, mutable=False) + assert t.size is None, f"fixed size list returns not supported: {t}" + return VectorCType(elem) + + raise AssertionError(f"unrecognized return type {t}") + + +# Translation of a single return to its C++ type +def return_type(r: Return) -> CType: + return returntype_type(r.type, mutable=r.is_write) + + +# Translation of a full (possibly multi) return from JIT to its C++ type +def returns_type(rs: Sequence[Return]) -> CType: + if len(rs) == 0: + return BaseCType(voidT) + elif len(rs) == 1: + return return_type(rs[0]) + else: + return TupleCType([return_type(r) for r in rs]) + + +def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]: + returns: list[str] = [] + for i, r in enumerate(f.func.returns): + # If we have an inplace function, the return argument is + # implicitly named self. + # TODO: Consider incorporating this into the data model + if f.func.name.name.inplace: + assert i == 0, "illegal inplace function with multiple returns" + name = "self" + # If we are out function, the name is the name of the + # corresponding output function (r.name will get recorded + # in field_name later.) + elif f.func.is_out_fn(): + name = f.func.arguments.out[i].name + # If the return argument is explicitly named... 
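+        # (e.g. schemas that name their outputs, such as `values`/`indices`);
+        # if the name collides with an argument name it is suffixed with
+        # `_return` below.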
+ elif r.name: + name_conflict = any( + r.name == a.name for a in f.func.schema_order_arguments() + ) + if name_conflict and not f.func.is_out_fn(): + name = f"{r.name}_return" + else: + name = r.name + # If there is no explicit name and no fallback name was passed in, we just name the output result, + # unless it's a multi-return, in which case it's result0, + # result1, etc (zero-indexed) + else: + name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}" + returns.append(name) + return returns + + +JIT_TO_CPP_DEFAULT = { + "False": "false", + "True": "true", + "None": "torch::executorch::nullopt", # UGH this one is type directed + "[]": "{}", + "contiguous_format": "torch::executorch::MemoryFormat::Contiguous", + "long": "torch::executorch::kLong", +} + + +# Convert a JIT default into C++ expression representing the default +def default_expr(d: str, t: Type) -> str: + if d == "None" and str(t) == "Tensor?": + return "{}" + if isinstance(t, BaseType) and t.name is BaseTy.str: + # Schema allows single quotes but C++ needs double + if len(d) >= 2 and d[0] == "'" and d[-1] == "'": + s = "" + i = 1 + while i + 1 < len(d): + if d[i] != "\\": + if d[i] == '"': + s += '\\"' + else: + s += d[i] + i += 1 + else: + if d[i + 1] == "'": + s += "'" + else: + s += d[i : i + 2] + i += 2 + + return f'"{s}"' + + if isinstance(t, OptionalType): + if d == "None": + return "torch::executor::nullopt" + + return default_expr(d, t.elem) + + if isinstance(t, ListType): + if d.startswith("[") and d.endswith("]"): + return "{" + d[1:-1] + "}" + elif t.size is None: + # NOTE: Sized lists can have scalar defaults + raise ValueError(f"Expected a list default '[...]' but found: '{d}'") + + return JIT_TO_CPP_DEFAULT.get(d, d) + + +# Convert an argument into its C++ API form + + +def argument( + a: Argument | TensorOptionsArguments | SelfArgument, + *, + cpp_no_default_args: set[str], + method: bool, + faithful: bool, + has_tensor_options: bool, +) -> list[Binding]: + def sub_argument( + a: Argument | TensorOptionsArguments | SelfArgument, + ) -> list[Binding]: + return argument( + a, + cpp_no_default_args=cpp_no_default_args, + method=method, + faithful=faithful, + has_tensor_options=has_tensor_options, + ) + + if isinstance(a, Argument): + binds: ArgName + if a.name == "memory_format" and has_tensor_options: + binds = SpecialArgName.possibly_redundant_memory_format + else: + binds = a.name + default: str | None = None + if a.name not in cpp_no_default_args and a.default is not None: + default = default_expr(a.default, a.type) + return [ + Binding( + nctype=argument_type(a, binds=binds), + name=a.name, + default=default, + argument=a, + ) + ] + elif isinstance(a, TensorOptionsArguments): + raise NotImplementedError("Need to implement type resolution for TensorOptions") + elif isinstance(a, SelfArgument): + if method: + # Caller is responsible for installing implicit this in context! 
+ return [] + else: + return sub_argument(a.argument) + else: + assert_never(a) + + +def arguments( + arguments: Arguments, + *, + faithful: bool, + method: bool, + cpp_no_default_args: set[str], +) -> list[Binding]: + args: list[Argument | TensorOptionsArguments | SelfArgument] = [] + if faithful: + args.extend(arguments.non_out) + args.extend(arguments.out) + else: + args.extend(arguments.out) + args.extend(arguments.non_out) + return [ + r.no_default() if faithful else r + for a in args + for r in argument( + a, + faithful=faithful, + method=method, + has_tensor_options=arguments.tensor_options is not None, + cpp_no_default_args=cpp_no_default_args, + ) + ] diff --git a/torchgen/executorch/api/types/__init__.py b/torchgen/executorch/api/types/__init__.py new file mode 100644 index 00000000000..08cb168df73 --- /dev/null +++ b/torchgen/executorch/api/types/__init__.py @@ -0,0 +1,4 @@ +from torchgen.executorch.api.types.types import * + + +from torchgen.executorch.api.types.signatures import * # usort: skip diff --git a/torchgen/executorch/api/types/signatures.py b/torchgen/executorch/api/types/signatures.py new file mode 100644 index 00000000000..ac3477cede6 --- /dev/null +++ b/torchgen/executorch/api/types/signatures.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import torchgen.api.cpp as aten_cpp +from torchgen.executorch.api.types.types import contextArg + + +if TYPE_CHECKING: + from torchgen.api.types import Binding, CType + from torchgen.model import FunctionSchema, NativeFunction + + +@dataclass(frozen=True) +class ExecutorchCppSignature: + """ + This signature is merely a CppSignature with Executorch types (optionally + contains KernelRuntimeContext as well). The inline definition of + CppSignature is generated in Functions.h and it's used by unboxing + functions. + """ + + # The schema this signature is derived from + func: FunctionSchema + + # The set of C++ arguments which should not have defaults applied to them + cpp_no_default_args: set[str] + + # Allows you to prepend an arbitrary prefix to the signature name. + # This is useful for parts of the codegen that generate wrappers around kernels, + # and need to avoid naming collisions. 
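+    # e.g. a hypothetical prefix of "internal_" would turn the signature for
+    # add.out into something like `internal_add_outf(...)`.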
+ prefix: str = "" + + def arguments(self, *, include_context: bool = True) -> list[Binding]: + return ([contextArg] if include_context else []) + et_cpp.arguments( + self.func.arguments, + faithful=True, # always faithful, out argument at the end + method=False, # method not supported + cpp_no_default_args=self.cpp_no_default_args, + ) + + def name(self) -> str: + return self.prefix + aten_cpp.name( + self.func, + faithful_name_for_out_overloads=True, + ) + + def decl(self, name: str | None = None, *, include_context: bool = True) -> str: + args_str = ", ".join( + a.decl() for a in self.arguments(include_context=include_context) + ) + if name is None: + name = self.name() + return f"{self.returns_type().cpp_type()} {name}({args_str})" + + def defn(self, name: str | None = None) -> str: + args = [a.defn() for a in self.arguments()] + args_str = ", ".join(args) + if name is None: + name = self.name() + return f"{self.returns_type().cpp_type()} {name}({args_str})" + + def returns_type(self) -> CType: + return et_cpp.returns_type(self.func.returns) + + @staticmethod + def from_native_function( + f: NativeFunction, *, prefix: str = "" + ) -> ExecutorchCppSignature: + return ExecutorchCppSignature( + func=f.func, prefix=prefix, cpp_no_default_args=f.cpp_no_default_args + ) + + +from torchgen.executorch.api import et_cpp diff --git a/torchgen/executorch/api/types/types.py b/torchgen/executorch/api/types/types.py new file mode 100644 index 00000000000..b3a960a8246 --- /dev/null +++ b/torchgen/executorch/api/types/types.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from torchgen.api.types import ( + BaseCppType, + BaseCType, + Binding, + boolT, + CType, + doubleT, + Expr, + longT, + MutRefCType, + NamedCType, +) +from torchgen.model import BaseTy + + +halfT = BaseCppType("torch::executor", "Half") +bfloat16T = BaseCppType("torch::executor", "BFloat16") +stringT = BaseCppType("torch::executor", "string_view") +scalarTypeT = BaseCppType("torch::executor", "ScalarType") +tensorT = BaseCppType("torch::executor", "Tensor") +tensorListT = BaseCppType("torch::executor", "TensorList") +scalarT = BaseCppType("torch::executor", "Scalar") +memoryFormatT = BaseCppType("torch::executor", "MemoryFormat") +intArrayRefT = BaseCppType("torch::executor", "IntArrayRef") +optionalT = BaseCppType("torch::executor", "optional") +contextT = BaseCppType("torch::executor", "KernelRuntimeContext") + +contextExpr = Expr( + expr="context", + type=NamedCType(name="context", type=MutRefCType(BaseCType(contextT))), +) + +contextArg = Binding( + name="context", + nctype=contextExpr.type, + argument=None, # type: ignore[arg-type] + default=None, +) + +BaseTypeToCppMapping: dict[BaseTy, BaseCppType] = { + BaseTy.int: longT, + BaseTy.float: doubleT, + BaseTy.bool: boolT, + BaseTy.str: stringT, + BaseTy.ScalarType: scalarTypeT, + BaseTy.Tensor: tensorT, + BaseTy.Scalar: scalarT, + BaseTy.MemoryFormat: memoryFormatT, +} + + +@dataclass(frozen=True) +class OptionalCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. 
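+        # `strip_ref` only applies to the outermost type; the element type
+        # keeps whatever form it already has.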
+ return f"torch::executor::optional<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"torch::executor::optional<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return OptionalCType(self.elem.remove_const_ref()) + + +@dataclass(frozen=True) +class ArrayRefCType(CType): + elem: CType + + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. + return f"torch::executor::ArrayRef<{self.elem.cpp_type()}>" + + def cpp_type_registration_declarations(self) -> str: + return f"torch::executor::ArrayRef<{self.elem.cpp_type_registration_declarations()}>" + + def remove_const_ref(self) -> CType: + return ArrayRefCType(self.elem.remove_const_ref()) diff --git a/torchgen/executorch/api/unboxing.py b/torchgen/executorch/api/unboxing.py new file mode 100644 index 00000000000..6845e72a22a --- /dev/null +++ b/torchgen/executorch/api/unboxing.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, Sequence, TYPE_CHECKING + +from torchgen.model import ( + Argument, + BaseTy, + BaseType, + ListType, + NativeFunction, + OptionalType, + Type, +) + + +if TYPE_CHECKING: + from torchgen.api.types import Binding, CType, NamedCType + + +connector = "\n\t" + + +# Return unboxing function name for a NativeFunction +def name(f: NativeFunction) -> str: + return f.func.name.unambiguous_name() + + +@dataclass(frozen=True) +class Unboxing: + """ + Takes a sequence of Bindings and unbox EValues to these Bindings. Return generated code that performs correct unboxing. + A sample generated code: + // aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + void mul_out(EValue** stack) { + EValue& self = *stack[0]; + EValue& other = *stack[1]; + EValue& out = *stack[2]; + const torch::executor::Tensor & self_base = self.to(); + const torch::executor::Tensor & other_base = other.to(); + torch::executor::Tensor & out_base = out.to(); + + EXECUTORCH_SCOPE_PROF("native_call_mul.out"); + torch::executor::mul_outf(self_base, other_base, out_base); + + + } + """ + + # this is a callable that converts a JIT argument, into its C++ type. + # Translates (type, mutability, binds) to NamedCType. E.g., torchgen.api.cpp.argumenttype_type. 
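+    # e.g. torchgen.executorch.api.et_cpp.argumenttype_type (defined in this
+    # patch) for ExecuTorch types, or the torchgen.api.cpp equivalent when
+    # running in ATen mode.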
+ argument_type_gen: Callable[ + ..., + NamedCType, + ] + + # Convert all the arguments in a NativeFunction to C++ code + def convert_arguments( + self, args: Sequence[Binding] + ) -> tuple[list[Binding], list[str]]: + code_list = [f"EValue& {args[i].name} = *stack[{i}];" for i in range(len(args))] + binding_list = [] + for arg in args: + # expecting only Argument + if not isinstance(arg.argument, Argument): + raise Exception( # noqa: TRY002 + f"Unexpected argument type, expecting `Argument` but got {arg}" + ) + argument: Argument = arg.argument + unboxed_name, _, code, decl = self.argumenttype_evalue_convert( + argument.type, argument.name, mutable=argument.is_write + ) + code_list.extend(decl) + code_list.extend(code) + binding_list.append(arg.with_name(unboxed_name)) + return binding_list, code_list + + def argumenttype_evalue_convert( + self, t: Type, arg_name: str, *, mutable: bool = False + ) -> tuple[str, CType, list[str], list[str]]: + """ + Takes in the type, name and mutability corresponding to an argument, and generates a tuple of: + (1) the C++ code necessary to unbox the argument + (2) A Binding corresponding to the newly created unboxed variable, including variable name and its CType + :param t: a `Type` of an argument + :param arg_name: argument name + :param mutable: boolean for whether this argument type is mutable + :return: unboxed result + """ + ctype = self.argument_type_gen(t, mutable=mutable, binds=arg_name).type + + if isinstance(t, BaseType): + out_name = f"{arg_name}_base" + code, decl = self._gen_code_base_type( + arg_name=arg_name, out_name=out_name, ctype=ctype + ) + elif isinstance(t, OptionalType): + out_name = f"{arg_name}_opt_out" + code, decl = self._gen_code_optional_type( + arg_name=arg_name, out_name=out_name, t=t, ctype=ctype + ) + elif isinstance(t, ListType): + out_name = f"{arg_name}_list_out" + code, decl = self._gen_code_list_type( + arg_name=arg_name, out_name=out_name, t=t, ctype=ctype + ) + else: + raise Exception( # noqa: TRY002 + f"Cannot handle type {t}. 
arg_name: {arg_name}" + ) # noqa: TRY002 + return out_name, ctype, code, decl + + def _gen_code_base_type( + self, arg_name: str, out_name: str, ctype: CType + ) -> tuple[list[str], list[str]]: + return [ + f"{ctype.cpp_type()} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();" + ], [] + + def _gen_code_optional_type( + self, arg_name: str, out_name: str, t: OptionalType, ctype: CType + ) -> tuple[list[str], list[str]]: + in_name = f"{arg_name}_opt_in" + res_name, base_type, res_code, decl = self.argumenttype_evalue_convert( + t.elem, in_name + ) + return ( + f""" + auto {out_name} = {arg_name}.toOptional<{base_type.cpp_type(strip_ref=True)}>(); + """.split( + "\n" + ), + decl, + ) + + def _gen_code_list_type( + self, arg_name: str, out_name: str, t: ListType, ctype: CType + ) -> tuple[list[str], list[str]]: + in_name = f"{arg_name}_list_in" + elem_name = f"{arg_name}_elem" + code = [] + res_name, res_ctype, res_code, decl = self.argumenttype_evalue_convert( + t.elem, elem_name + ) + + if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.Tensor: + code.extend( + f""" + auto {out_name} = {arg_name}.toTensorList(); + """.split( + "\n" + ) + ) + elif isinstance(t.elem, BaseType) and ( + t.elem.name == BaseTy.int or t.elem.name == BaseTy.SymInt + ): + code.extend( + f""" + auto {out_name} = {arg_name}.toIntList(); + """.split( + "\n" + ) + ) + elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.float: + code.extend( + f""" + auto {out_name} = {arg_name}.toDoubleList(); + """.split( + "\n" + ) + ) + elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool: + # handle list type with size, e.g., bool[4] + code.extend( + f""" +#ifdef USE_ATEN_LIB +std::array {out_name}; +auto {in_name} = {arg_name}.toBoolList(); +size_t _i = 0; +for (auto {elem_name}: {in_name}) {{ + {out_name}[_i++] = {elem_name}; +}} +#else +auto {out_name} = {arg_name}.toBoolList(); +#endif + """.split( + "\n" + ) + ) + # pytorch codegen: + # we have to use c10::List for optional element. e.g., Tensor?[] -> c10::List<::std::optional> + elif ( + isinstance(t.elem, OptionalType) + and isinstance(t.elem.elem, BaseType) + and t.elem.elem.name == BaseTy.Tensor + ): + code.extend( + f""" +#ifdef USE_ATEN_LIB +auto {in_name} = {arg_name}.toListOptionalTensor(); +c10::List<::std::optional> {out_name}; +for (auto {elem_name}: {in_name}) {{ + {out_name}.push_back({elem_name}); +}} +#else +auto {out_name} = {arg_name}.toListOptionalTensor(); +#endif + """.split( + "\n" + ) + ) + else: + # use ArrayRef as default. + vec_name = arg_name + "_vec" + # need to bring vector instantiation out of scope so that ArrayRef has valid data + decl.append( + f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};" + ) + code.extend( + f""" + for (EValue {elem_name}: {in_name}) {{ + {connector.join(res_code)} + {vec_name}.push_back({res_name}); + }} + {ctype.cpp_type(strip_ref=True)} {out_name}({vec_name}); + """.split( + "\n" + ) + ) + return code, decl diff --git a/torchgen/executorch/model.py b/torchgen/executorch/model.py new file mode 100644 index 00000000000..6aadfe41dae --- /dev/null +++ b/torchgen/executorch/model.py @@ -0,0 +1,220 @@ +# Represents all kernels used by an Executorch model. +# It maintains a Dict[OperatorName, Dict[ETKernelKey, BackendMetadata]] structure. 
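+# Schematically (illustrative values only):
+#
+#   { aten::add.out: { <ETKernelKey for (Float, dim_order 0,1) args>: BackendMetadata(kernel="add_out", ...),
+#                      <default ETKernelKey>: BackendMetadata(...) } }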
+ +from __future__ import annotations + +import itertools +from collections import defaultdict, namedtuple +from dataclasses import dataclass +from enum import IntEnum + +from torchgen.model import ( + BackendIndex, + BackendMetadata, + DispatchKey, + NativeFunction, + NativeFunctionsGroup, + OperatorName, +) +from torchgen.utils import assert_never + + +KERNEL_KEY_VERSION = 1 + + +# TODO: Duplicated Subset from codegen.tool.gen_oplist, remove declaration in codegen +class ScalarType(IntEnum): + Byte = 0 + Char = 1 + Short = 2 + Int = 3 + Long = 4 + Float = 6 + Double = 7 + Bool = 11 + + +ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "kernel_index"]) + + +@dataclass(frozen=True) +class ETKernelKeyOpArgMeta: + arg_name: str + dtype: str + # The order of the dimensions if entry is a Tensor + dim_order: tuple[int, ...] + + def to_native_string(self) -> str: + dtype_str = ScalarType[self.dtype].value + dim_str = str(self.dim_order)[1:-1].replace(" ", "") + return f"{dtype_str};{dim_str}" + + +@dataclass(frozen=True) +class ETKernelKey: + # Field undefined is default = True + arg_meta: tuple[ETKernelKeyOpArgMeta, ...] = () + + # Indicator for this kernel being used as a catch all + default: bool = False + + version: int = KERNEL_KEY_VERSION + + @staticmethod + def gen_from_yaml( + args: dict[str, tuple[str, str]], + type_alias_map: dict[str, list[str]], # TODO: Support unwrapped str val + dim_order_alias_map: dict[str, list[int]], + ) -> list[ETKernelKey]: + """Generate ETKernelKeys from arg kernel specs + Multiple ETKernelKeys are returned due to dtype permutations from utilizing + type_alias_map (actualizing each potential type permutation as a KernelKey) + + Args: + args: Mapping from argument name to kernel specs + Kernel specs are a tuple of (dtype, dim_order). 
+ Currently tuple entries must be aliased via the alias map arguments + type_alias_map: Mapping from type alias to potential type enums + i.e { T0 : [Double, Int] } means T0 can be either Double or Int + Used for lookup by args + dim_order_alias_map: Mapping from alias to a list of dimension orders + Used for lookup by args + """ + # Cast to dim order to int + dim_order_alias_map = { + k: [int(alias) for alias in v] for k, v in dim_order_alias_map.items() + } + kernel_keys = [] + + # Get all used Dtype Alias + dtype_alias_used = set() + for type_alias, dim_order in args.values(): + # Enforce usage of alias initially + # TODO: Support inlined arguments + assert type_alias in type_alias_map, "Undefined type alias: " + str( + type_alias + ) + assert ( + dim_order in dim_order_alias_map + ), "Undefined dim_order alias: " + str(dim_order) + dtype_alias_used.add(type_alias) + + # Generate all permutations of dtype alias values + alias_dtypes = [ + [(alias, dtype) for dtype in type_alias_map[alias]] + for alias in dtype_alias_used + ] + alias_permutations = [ + dict(permutation) for permutation in list(itertools.product(*alias_dtypes)) + ] + + # Using each alias value permutation, generate kernel keys + op_arg_cache = {} + for permutation in alias_permutations: + arg_list = [] + for arg_name, arg_spec in args.items(): + dtype = permutation[arg_spec[0]] + dim_order = dim_order_alias_map[arg_spec[1]] # type: ignore[assignment] + if ( + cache_key := (arg_name, dtype, tuple(dim_order)) + ) not in op_arg_cache: + op_arg_cache[cache_key] = ETKernelKeyOpArgMeta(*cache_key) # type: ignore[arg-type] + + arg_list.append(op_arg_cache[cache_key]) + kernel_keys.append(ETKernelKey(tuple(arg_list))) + + return kernel_keys + + def to_native_string(self) -> str: + if self.default: + return "default" + return ( + "v" + + str(KERNEL_KEY_VERSION) + + "/" + + "|".join([arg.to_native_string() for arg in self.arg_meta]) + ) + + +@dataclass(frozen=True) +class ETKernelIndex: + index: dict[OperatorName, dict[ETKernelKey, BackendMetadata]] + + def has_kernels(self, g: NativeFunction | NativeFunctionsGroup) -> bool: + m = self.get_kernels(g) + return m is not None + + def get_kernels( + self, g: NativeFunction | NativeFunctionsGroup + ) -> dict[ETKernelKey, BackendMetadata]: + if isinstance(g, NativeFunction): + f = g + elif isinstance(g, NativeFunctionsGroup): + f = g.functional + else: + assert_never(g) + if f.func.name not in self.index: + return {} + return self.index[f.func.name] + + @staticmethod + def grow_from_backend_indices( + kernel_index: dict[OperatorName, dict[ETKernelKey, BackendMetadata]], + backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]], + ) -> None: + for dk in backend_indices: + index = backend_indices[dk] + for op, backend_metadata in index.items(): + if op in kernel_index: + kernel_index[op][ETKernelKey(default=True)] = backend_metadata + else: + kernel_index[op] = {ETKernelKey(default=True): backend_metadata} + + @staticmethod + def from_backend_indices( + backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]] + ) -> ETKernelIndex: + kernel_index: dict[ + OperatorName, dict[ETKernelKey, BackendMetadata] + ] = defaultdict(dict) + ETKernelIndex.grow_from_backend_indices(kernel_index, backend_indices) + return ETKernelIndex(kernel_index) + + def grow( + self, backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]] + ) -> ETKernelIndex: + ETKernelIndex.grow_from_backend_indices(self.index, backend_indices) + return self + + def 
_to_backend_index(self) -> BackendIndex: + """ + WARNING: this will be deprecated once all the codegen places know how to handle ETKernelIndex. + """ + index: dict[OperatorName, BackendMetadata] = {} + for op in self.index: + kernel_dict = self.index[op] + assert ( + len(kernel_dict.values()) == 1 + ), f"Can't convert ETKernelIndex to BackendIndex because {op} has more than one kernels. Got {kernel_dict}" + index[op] = kernel_dict.get( + ETKernelKey(default=True), + BackendMetadata(kernel="", structured=False, cpp_namespace=""), + ) + return BackendIndex( + dispatch_key=DispatchKey.CPU, + use_out_as_primary=False, + device_guard=False, + external=False, + index=index, + ) + + # Note duplicate ETKernelKey from index_b will clobber the metadata from index_a + @staticmethod + def merge_indices(index_a: ETKernelIndex, index_b: ETKernelIndex) -> ETKernelIndex: + combined = defaultdict(dict, index_a.index.copy()) + + for op, entry in index_b.index.items(): + for key, metadata in entry.items(): + combined[op][key] = metadata + + return ETKernelIndex(combined) diff --git a/torchgen/executorch/parse.py b/torchgen/executorch/parse.py new file mode 100644 index 00000000000..8095abd5b6b --- /dev/null +++ b/torchgen/executorch/parse.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from collections import defaultdict, namedtuple +from typing import Any + +import yaml + +from torchgen.executorch.model import ETKernelIndex, ETKernelKey +from torchgen.gen import LineLoader, parse_native_yaml +from torchgen.model import ( + BackendMetadata, + DispatchKey, + FunctionSchema, + NativeFunction, + OperatorName, +) +from torchgen.utils import NamespaceHelper + + +# Parse native_functions.yaml into a sequence of NativeFunctions and ET Backend Indices. +ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "et_kernel_indices"]) + +# Fields in native_functions.yaml used to determine which kernels should be used +ET_FIELDS = ["kernels", "type_alias", "dim_order_alias"] + + +def parse_from_yaml(ei: dict[str, object]) -> dict[ETKernelKey, BackendMetadata]: + """Given a loaded yaml representing kernel assignment information, extract the + mapping from `kernel keys` to `BackendMetadata` (the latter representing the kernel instance) + + Args: + ei: Dict keys {kernels, type_alias, dim_order_alias} + See ETKernelKey for description of arguments + """ + e = ei.copy() + if (kernels := e.pop("kernels", None)) is None: + return {} + + type_alias: dict[str, list[str]] = e.pop("type_alias", {}) # type: ignore[assignment] + dim_order_alias: dict[str, list[str]] = e.pop("dim_order_alias", {}) # type: ignore[assignment] + dim_order_alias.pop("__line__", None) + + kernel_mapping: dict[ETKernelKey, BackendMetadata] = {} + + for entry in kernels: # type: ignore[attr-defined] + arg_meta = entry.get("arg_meta") + if arg_meta is not None: + arg_meta.pop("__line__") + + kernel_name = entry.get("kernel_name") + namespace_helper = NamespaceHelper.from_namespaced_entity( + kernel_name, max_level=3 + ) + kernel_namespace = namespace_helper.get_cpp_namespace(default="at") + backend_metadata = BackendMetadata( + kernel=namespace_helper.entity_name, + structured=False, + cpp_namespace=(kernel_namespace + "::native"), + ) + + kernel_keys = ( + [ETKernelKey((), default=True)] + if arg_meta is None + else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) # type: ignore[arg-type] + ) + + for kernel_key in kernel_keys: + assert kernel_key not in kernel_mapping, ( + "Duplicate kernel key: " + str(kernel_key) + " " + 
str(e) + ) + kernel_mapping[kernel_key] = backend_metadata + + return kernel_mapping + + +def parse_et_yaml_struct(es: object) -> ETKernelIndex: + """Given a loaded yaml representing a list of operators, for each op extract the mapping + of `kernel keys` to `BackendMetadata` (the latter representing the kernel instance + that should be used by the kernel key). + """ + indices: dict[OperatorName, dict[ETKernelKey, BackendMetadata]] = {} + for ei in es: # type: ignore[attr-defined] + e = ei.copy() + + funcs = e.pop("func") + assert isinstance(funcs, str), f"not a str: {funcs}" + namespace_helper = NamespaceHelper.from_namespaced_entity( + namespaced_entity=funcs, max_level=1 + ) + opname = FunctionSchema.parse(namespace_helper.entity_name).name + + assert opname not in indices, f"Duplicate func found in yaml: {opname} already" + + if len(index := parse_from_yaml(e)) != 0: + indices[opname] = index + + return ETKernelIndex(indices) + + +def extract_kernel_fields(es: object) -> dict[OperatorName, dict[str, Any]]: + """Given a loaded yaml representing a list of operators, extract the + kernel key related fields indexed by the operator name. + """ + fields: dict[OperatorName, dict[str, Any]] = defaultdict(dict) + for ei in es: # type: ignore[attr-defined] + funcs = ei.get("func") + assert isinstance(funcs, str), f"not a str: {funcs}" + namespace_helper = NamespaceHelper.from_namespaced_entity( + namespaced_entity=funcs, max_level=1 + ) + opname = FunctionSchema.parse(namespace_helper.entity_name).name + + for field in ET_FIELDS: + if (value := ei.get(field)) is not None: + fields[opname][field] = value + + return fields + + +def parse_et_yaml( + path: str, + tags_yaml_path: str, + ignore_keys: set[DispatchKey] | None = None, + skip_native_fns_gen: bool = False, +) -> tuple[list[NativeFunction], dict[OperatorName, dict[str, Any]]]: + """Parse native_functions.yaml into NativeFunctions and an Operator Indexed Dict + of fields to persist from native_functions.yaml to functions.yaml + """ + with open(path) as f: + es = yaml.load(f, Loader=LineLoader) + + et_kernel = extract_kernel_fields(es) + + # Remove ET specific fields from entries for BC compatibility + strip_et_fields(es) + + native_yaml = parse_native_yaml( + path, + tags_yaml_path, + ignore_keys, + skip_native_fns_gen=skip_native_fns_gen, + loaded_yaml=es, + ) + return native_yaml.native_functions, et_kernel + + +def strip_et_fields(es: object) -> None: + """Given a loaded yaml representing a list of operators, + remove ET specific fields from every entries for BC compatibility + """ + for entry in es: # type: ignore[attr-defined] + for field in ET_FIELDS: + entry.pop(field, None) diff --git a/torchgen/fuse/gen_patterns.py b/torchgen/fuse/gen_patterns.py new file mode 100644 index 00000000000..0861c882e3f --- /dev/null +++ b/torchgen/fuse/gen_patterns.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +import os + +from torch._inductor import pattern_matcher +from torch._inductor.fx_passes import joint_graph + + +if __name__ == "__main__": + # Start by deleting all the existing patterns. + for path in pattern_matcher.SERIALIZED_PATTERN_PATH.iterdir(): + if path.name in {"__init__.py", "__pycache__"}: + continue + if path.is_file(): + path.unlink() + + # Now have joint_graph load all known patterns and tell the pattern matcher + # to serialize the patterns as it goes. 
+ os.environ["PYTORCH_GEN_PATTERNS"] = "1" + joint_graph.lazy_init() diff --git a/torchgen/gen.py b/torchgen/gen.py new file mode 100644 index 00000000000..e5870a24fc6 --- /dev/null +++ b/torchgen/gen.py @@ -0,0 +1,2986 @@ +from __future__ import annotations + +import argparse +import functools +import json +import os +from collections import defaultdict, namedtuple, OrderedDict +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Callable, Literal, Sequence, TypeVar + +import yaml + +import torchgen.api.dispatcher as dispatcher +import torchgen.api.meta as meta +import torchgen.api.native as native +import torchgen.api.structured as structured +import torchgen.dest as dest +from torchgen.aoti.fallback_ops import inductor_fallback_ops +from torchgen.api import cpp +from torchgen.api.translate import translate +from torchgen.api.types import ( + Binding, + CppSignature, + CppSignatureGroup, + DispatcherSignature, + NamedCType, + NativeSignature, + SpecialArgName, +) +from torchgen.context import ( + method_with_native_function, + native_function_manager, + with_native_function, + with_native_function_and_indices, +) +from torchgen.gen_aoti_c_shim import ( + gen_aoti_c_shim, + gen_static_dispatch_backend_call_signature, + get_fallback_op_name, + get_header_for_aoti, +) +from torchgen.gen_functionalization_type import ( + gen_functionalization_definition, + gen_functionalization_registration, + gen_functionalization_view_inverse_declaration, + GenCompositeViewCopyKernel, +) +from torchgen.gen_vmap_plumbing import gen_all_vmap_plumbing +from torchgen.model import ( + Argument, + BackendIndex, + BackendMetadata, + BaseOperatorName, + DEFAULT_KERNEL_NAMESPACE, + DispatchKey, + FRAGMENT_NAMESPACES, + FunctionSchema, + is_cuda_dispatch_key, + is_generic_dispatch_key, + is_ufunc_dispatch_key, + is_xpu_dispatch_key, + Location, + NativeFunction, + NativeFunctionsGroup, + NativeFunctionsViewGroup, + OperatorName, + OptionalType, + SchemaKind, + SelfArgument, + STRUCTURED_DISPATCH_KEYS, + TensorOptionsArguments, + Type, + Variant, + ViewSchemaKind, +) +from torchgen.native_function_generation import ( + add_generated_native_functions, + gen_composite_functional_kernel, + gen_composite_out_kernel, + pre_group_native_functions, +) +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.utils import ( + assert_never, + concatMap, + context, + FileManager, + make_file_manager, + mapMaybe, + NamespaceHelper, + Target, +) +from torchgen.yaml_utils import YamlDumper, YamlLoader + + +T = TypeVar("T") + +# Welcome to the ATen code generator v2! The ATen code generator is +# responsible for parsing native_functions.yaml and then generating +# various generated files (e.g., TypeDefault.cpp) based on the operators +# defined in this file. This means that the code generator knows how to +# parse function schema, and then translate this into various C++ types +# and boilerplate code. +# +# Some things to know about this file when you modify it: +# +# - This file has STRICT mypy typechecking. Typecheck it with +# `mypy --config mypy-strict.ini` in the root source directory +# +# - Most of the heavy lifting lives in external modules: +# - 'model' has the data model for native_functions.yaml. The classes +# in those file represent what you see when you look at +# a native_functions.yaml +# - 'api' has conversions for how to translate JIT schema into +# the various C++ APIs that the codegen interacts with. 
There +# are in fact THREE different C++ APIs: the public C++ API, +# the dispatcher API, and the legacy dispatcher API. See each +# of these respective files for more information + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# HELPER FUNCTIONS +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +# A custom loader for YAML to let us also keep track of line numbers +# of each entry in the YAML file +class LineLoader(YamlLoader): + def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] + mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] + # Add 1 so line numbering starts at 1 + mapping["__line__"] = node.start_mark.line + 1 + return mapping + + +# Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices. +ParsedYaml = namedtuple("ParsedYaml", ["native_functions", "backend_indices"]) + + +_GLOBAL_PARSE_NATIVE_YAML_CACHE: dict[str, ParsedYaml] = {} +_GLOBAL_PARSE_TAGS_YAML_CACHE: dict[str, set[str]] = {} + + +def parse_native_yaml_struct( + es: object, + valid_tags: set[str], + ignore_keys: set[DispatchKey] | None = None, + path: str = "", + skip_native_fns_gen: bool = False, +) -> ParsedYaml: + assert isinstance(es, list) + rs: list[NativeFunction] = [] + bs: dict[DispatchKey, dict[OperatorName, BackendMetadata]] = defaultdict(dict) + for e in es: + assert isinstance(e, dict), f"expected to be dict: {e}" + assert isinstance(e.get("__line__"), int), e + loc = Location(path, e["__line__"]) + funcs = e.get("func") + assert funcs is not None, f"missed 'func' in {e}" + with context(lambda: f"in {loc}:\n {funcs}"): + func, m = NativeFunction.from_yaml(e, loc, valid_tags, ignore_keys) + rs.append(func) + BackendIndex.grow_index(bs, m) + error_check_native_functions(rs) + # Default dict is to prevent the codegen from barfing when we have a dispatch key that has no kernels yet. + indices: dict[DispatchKey, BackendIndex] = defaultdict( + lambda: BackendIndex( + dispatch_key=DispatchKey.Undefined, + use_out_as_primary=True, + external=False, + device_guard=False, + # I'm actually not sure about this; undefined could be hit on + # empty TensorList, hypothetically that could have sizes in it + index={}, + ) + ) + if not skip_native_fns_gen: + add_generated_native_functions(rs, bs) + for k, v in bs.items(): + # All structured in-tree operators are implemented in terms of their out operator. 
+ indices[k] = BackendIndex( + dispatch_key=k, + use_out_as_primary=True, + external=False, + # Only cuda-like devices in tree require device guards + device_guard=is_cuda_dispatch_key(k) or is_xpu_dispatch_key(k), + index=v, + ) + return ParsedYaml(rs, indices) + + +def parse_tags_yaml_struct(es: object, path: str = "") -> set[str]: + assert isinstance(es, list) + rs: set[str] = set() + for e in es: + assert isinstance(e.get("__line__"), int), e + loc = Location(path, e["__line__"]) + tags = e.get("tag") + with context(lambda: f"in {loc}:\n {tags}"): + e_i = e.copy() + name = e_i.pop("tag") + desc = e_i.pop("desc", "") + # ensure that each tag has a non-empty description + assert desc != "" + rs.add(name) + return rs + + +@functools.lru_cache(maxsize=None) +def parse_tags_yaml(path: str) -> set[str]: + global _GLOBAL_PARSE_TAGS_YAML_CACHE + if path not in _GLOBAL_PARSE_TAGS_YAML_CACHE: + with open(path) as f: + es = yaml.load(f, Loader=LineLoader) + _GLOBAL_PARSE_TAGS_YAML_CACHE[path] = parse_tags_yaml_struct(es, path=path) + + return _GLOBAL_PARSE_TAGS_YAML_CACHE[path] + + +def parse_native_yaml( + path: str, + tags_yaml_path: str, + ignore_keys: set[DispatchKey] | None = None, + *, + skip_native_fns_gen: bool = False, + loaded_yaml: object | None = None, +) -> ParsedYaml: + global _GLOBAL_PARSE_NATIVE_YAML_CACHE + if path not in _GLOBAL_PARSE_NATIVE_YAML_CACHE: + valid_tags = parse_tags_yaml(tags_yaml_path) + + # if a loaded yaml is provided, use that instead of reading from path + if loaded_yaml is None: + with open(path) as f: + es = yaml.load(f, Loader=LineLoader) + else: + es = loaded_yaml + + _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] = parse_native_yaml_struct( + es, + valid_tags, + ignore_keys, + path=path, + skip_native_fns_gen=skip_native_fns_gen, + ) + + return _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] + + +# Some assertions are already performed during parsing, but those are only within a single NativeFunction. +# Assertions here are meant to be performed across NativeFunctions. +def error_check_native_functions(funcs: Sequence[NativeFunction]) -> None: + func_map: dict[OperatorName, NativeFunction] = {} + base_func_map: dict[BaseOperatorName, list[NativeFunction]] = defaultdict(list) + for f in funcs: + func_map[f.func.name] = f + base_func_map[f.func.name.name].append(f) + for f in funcs: + if f.structured_delegate is not None: + delegate_func = func_map.get(f.structured_delegate) + assert delegate_func is not None, ( + f"{f.func.name} is marked as a structured_delegate pointing to " + f"{f.structured_delegate}, but {f.structured_delegate} is missing." + ) + assert delegate_func.structured, ( + f"{f.func.name} is marked as a structured_delegate pointing to " + f"{f.structured_delegate}, but {f.structured_delegate} is not marked as structured. " + f"Consider adding 'structured=True' to the delegated operator" + ) + # See Note [resize_ in Functionalization] + # resize_() is technically an inplace view op (and therefore needs the tag), + # but it would be overkill to add a true "view" variant of resize. + # Instead, resize_() gets special treatment in functionalization, + # and we have a resize() op that is non-aliasing + functional. 
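+ # Illustrative example: transpose_ carries the inplace_view tag, so the checks below
+ # require the name to end in '_' and an out-of-place `transpose` counterpart to exist.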
+ if ( + "inplace_view" in f.tags + and str(f.func.name) != "resize_" + and str(f.func.name) != "resize_as_" + and str(f.func.name.name) != "set_" + ): + base_name = f.func.name.name + assert base_name.inplace, ( + f"{f.func.name} is marked with tag: inplace_view, but it doesn't follow the naming " + "convention for inplace ops - the codegen expects the base name to have a trailing underscore. " + ) + out_of_place_base_name = BaseOperatorName( + base_name.base, False, base_name.dunder_method + ) + assert len(base_func_map[out_of_place_base_name]) > 0, ( + f"{f.func.name} is marked with tag: inplace_view. The codegen expects there to be a corresponding " + f"out-of-place view op with the name '{base_name}' and matching schema, but it didn't find one. " + ) + + +def cpp_string(s: str) -> str: + """Convert a python string into a c++ string literal""" + s = s.replace("\\", "\\\\") + s = s.replace('"', '\\"') + s = s.replace("\a", "\\a") + s = s.replace("\b", "\\b") + s = s.replace("\f", "\\f") + s = s.replace("\n", "\\n") + s = s.replace("\v", "\\v") + s = s.replace("\t", "\\t") + return f'"{s}"' + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# C++ CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +# Most functions in this section are curried: they consist of a function +# that takes some parameters (e.g., what is to be generated) which itself +# returns a function that actually maps NativeFunction to the code +# to be generated. This pattern makes it convenient to use map, concatMap +# and similar functional combinators. + + +def static_dispatch_keys(backends: list[BackendIndex]) -> list[DispatchKey]: + if len(backends) == 0: + return [] + else: + return [backend.dispatch_key for backend in backends] + [ + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + DispatchKey.CompositeExplicitAutograd, + DispatchKey.CompositeExplicitAutogradNonFunctional, + ] + + +def get_static_dispatch_backend( + f: NativeFunction, backend_index: BackendIndex +) -> DispatchKey | None: + if f.structured_delegate is not None or backend_index.has_kernel(f): + # TODO: for ops with structured_delegate it should check the dispatch table of + # the out variant instead. For now, these structured ops all have CPU/CUDA kernels + # so we always dispatch to the `backend`, but this could be wrong when we + # migrate math/default_backend ops to use structured delegate. 
+ return backend_index.dispatch_key + elif f.has_composite_explicit_autograd_kernel: + return DispatchKey.CompositeExplicitAutograd + elif f.has_composite_explicit_autograd_non_functional_kernel: + return DispatchKey.CompositeExplicitAutogradNonFunctional + elif f.has_composite_implicit_autograd_kernel: + return DispatchKey.CompositeImplicitAutograd + elif f.has_composite_implicit_autograd_nested_tensor_kernel: + return DispatchKey.CompositeImplicitAutogradNestedTensor + return None + + +def static_dispatch_ops_header( + f: NativeFunction, backend_index: list[BackendIndex] +) -> str | None: + if backend_index is None or f.manual_kernel_registration: + return None + + output = [] + for index in backend_index: + dispatch_key = get_static_dispatch_backend(f, index) + if dispatch_key is not None: + output.append( + f"#include " + ) + return "\n".join(output) + + +def static_dispatch_extra_headers(backends: list[BackendIndex]) -> list[str]: + return [ + f"#include " + for dispatch_key in static_dispatch_keys(backends) + ] + + +# Translates arguments of `sig` to CppSignature bindings. +# Note that we have a special case for `memory_format` argument and this case is not covered by +# tools.codegen.api.translate() yet as its application is limited to static dispatch. +def translate_args( + sig: CppSignature | DispatcherSignature, + cpp_sig: CppSignature, +) -> str: + # Adds SpecialArgName.possibly_redundant_memory_format NamedCType for memory_format bindings + def add_spl_memory_format_binding(input_bindings: list[Binding]) -> list[Binding]: + output_bindings: list[Binding] = [] + for binding in input_bindings: + if binding.name == "memory_format": + spl_mem_format_binding = Binding( + nctype=NamedCType( + SpecialArgName.possibly_redundant_memory_format, + binding.nctype.type, + ), + name=binding.name, + default=binding.default, + argument=binding.argument, + ) + output_bindings.append(spl_mem_format_binding) + else: + output_bindings.append(binding) + return output_bindings + + src_bindings = list(sig.arguments()) + goal_bindings = list(cpp_sig.arguments()) + # When last argument of CPP signature has SpecialArgName.possibly_redundant_memory_format NCType, + # get memory_format bindings of dispatcher signature to have the same NCType as well + for arg in goal_bindings: + if arg.nctype.name == SpecialArgName.possibly_redundant_memory_format: + src_bindings = add_spl_memory_format_binding(src_bindings) + break + exprs = translate(src_bindings, goal_bindings) + return ", ".join(a.expr for a in exprs) + + +def generate_static_dispatch_backend_call( + sig: CppSignature | DispatcherSignature, + f: NativeFunction, + backend_index: BackendIndex, +) -> str: + cpp_sig = gen_static_dispatch_backend_call_signature(sig, f) + name = cpp_sig.name() + exprs = translate_args(sig, cpp_sig) + backend_metadata = backend_index.get_kernel(f) + kernel_ns = ( + backend_metadata.cpp_namespace + if backend_metadata and backend_metadata.cpp_namespace + else DEFAULT_KERNEL_NAMESPACE + ) + ns = kernel_ns.replace("::native", "") + return f"return {ns}::{backend_index.dispatch_key.lower()}::{name}({exprs});" + + +def generate_static_dispatch_fallback_call( + sig: CppSignature | DispatcherSignature, + f: NativeFunction, + backend_indices: list[BackendIndex], +) -> str: + cpp_sigs = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=False + ) + if sig.symint and f.func.has_symint(): + cpp_sig = cpp_sigs.symint_signature + else: + cpp_sig = cpp_sigs.signature + assert cpp_sig is not None + name = 
cpp_sig.name() + exprs = translate_args(sig, cpp_sig) + ns = DEFAULT_KERNEL_NAMESPACE.replace("::native", "") + if f.has_composite_explicit_autograd_kernel: + return f"return {ns}::{DispatchKey.CompositeExplicitAutograd.lower()}::{name}({exprs});" + elif f.has_composite_explicit_autograd_non_functional_kernel: + return f"return {ns}::{DispatchKey.CompositeExplicitAutogradNonFunctional.lower()}::{name}({exprs});" + elif f.has_composite_implicit_autograd_kernel: + return f"return {ns}::{DispatchKey.CompositeImplicitAutograd.lower()}::{name}({exprs});" + elif f.has_composite_implicit_autograd_nested_tensor_kernel: + return f"return {ns}::{DispatchKey.CompositeImplicitAutogradNestedTensor.lower()}::{name}({exprs});" + else: + return f"""TORCH_CHECK(false, "Static dispatch does not support {name} for\ +{', '.join([str(index.dispatch_key)for index in backend_indices])} ");""" + + +def static_dispatch( + sig: CppSignature | DispatcherSignature, + f: NativeFunction, + backend_indices: list[BackendIndex], +) -> str: + """ + For a given `NativeFunction`, find out the corresponding backend and dispatch to it. If more than one + backends exsit, fallback to static dispatch by determining dispatch key from inputs. + Arguments: + sig: A CppSignature or DispatcherSignature for this native function we want to use. + f: NativeFunction to generate static dispatch. + backend_indices: All available backends. + Return: + C++ code to call backend-specific functions, e.g., "return at::cpu::add(self, other, scale);" + """ + if len(backend_indices) == 0 or f.manual_kernel_registration: + return "" + + keys = [ + b + for b in backend_indices + if b.has_kernel(f) + or ( + f.structured_delegate is not None + and b.dispatch_key in STRUCTURED_DISPATCH_KEYS + ) + ] + if len(keys) == 1: + return generate_static_dispatch_backend_call(sig, f, keys[0]) + elif len(keys) == 0: + return generate_static_dispatch_fallback_call(sig, f, backend_indices) + + native_tensor_args = [ + a.name + for a in sig.arguments() + if isinstance(a.argument, SelfArgument) + or isinstance(a.argument, Argument) + and a.argument.type.is_tensor_like() + ] + tensor_args = ", ".join(native_tensor_args) + tensor_opts = f.func.arguments.tensor_options + + stmts = [] + subexprs: list[str] = [] + if tensor_opts is not None: + subexprs.append( + "DispatchKeySet(c10::computeDispatchKey(dtype, layout, device))" + ) + if tensor_args != "": + subexprs.append(f"c10::detail::multi_dispatch_key_set({tensor_args})") + stmts.append(f"""DispatchKeySet _dk_set = {' | '.join(subexprs)};""") + stmts.append("DispatchKey _dk = c10::highestPriorityBackendTypeId(_dk_set);") + + dispatch_code = [] + for index in keys: + dispatch_code.append(f"""case DispatchKey::{index.dispatch_key}:""") + dispatch_code.append( + f"""\t{generate_static_dispatch_backend_call(sig, f, index)};""" + ) + + fallback = generate_static_dispatch_fallback_call(sig, f, backend_indices) + connector = "\n\t\t" + + return f""" + {connector.join(stmts)} + switch (_dk) {{ + {connector.join(dispatch_code)} + default: + {fallback} + }} + """ + + +# Generates RegisterSchema.cpp. 
Depending on the selector, either +# all schemas are registered, or only some are (in the case of +# selective build) +@dataclass(frozen=True) +class RegisterSchema: + selector: SelectiveBuilder + known_tags: dict[str, int] = field(default_factory=dict) + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + if not self.selector.is_native_function_selected(f): + return None + tags = "{" + ", ".join(f"at::Tag::{tag}" for tag in sorted(f.tags)) + "}" + if tags == "{}": + return f"m.def({cpp_string(str(f.func))}, {{}});\n" + maybe_tags = "" + if tags not in self.known_tags: + idx = len(self.known_tags) + self.known_tags[tags] = idx + maybe_tags = f"const std::vector tags_{idx} = {tags};\n" + return f"{maybe_tags}m.def({cpp_string(str(f.func))}, tags_{self.known_tags[tags]});\n" + + +# Generates Operators.h and Operators.cpp. +# These provide macros that, given an operator and overload name, allow users +# to access an "un-overloaded" function version of the operator. This +# is useful for extension writers who want to (1) want to decltype the operator +# and (2) don't want to worry about method-only operators. +@dataclass(frozen=True) +class ComputeOperators: + target: Literal[Target.DECLARATION, Target.DEFINITION] + static_dispatch_backend_indices: list[BackendIndex] + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str: + sig = DispatcherSignature.from_schema(f.func) + name = f.func.name.unambiguous_name() + + if self.target is Target.DECLARATION: + # Note [The ATen Operators API] + # The ATen Operators API lives in the at::_ops namespace, and contains compile-time + # metadata about each operator + entry points into the Dispatcher. + # The C++ function, method, and redispatch API's are all implemented as wrappers + # into various bits of the structs defined here. + # + # Important characteristics about the Operators API: + # (1) It follows the Dispatcher API. + # This is kind of necessary to avoid overhead. + # For example: if it followed the C++ API, then all of the faithful C++ factory functions + # would need to wrap their arguments into TensorOptions only to unwrap them again. + # (2) Overload names are disambiguated. + # This is helpful for pytorch extenders who would like to decltype() an aten operator, + # that has overloads, e.g. decltype(at::_ops::mul_Tensor::call) + # (3) No argument defaulting is allowed. + # This is more of an implementation detail to avoid #include cycles, + # since TensorBody.h (which defines the Tensor class) needs to include this file. + # (4) manual_cpp_bindings and faithful names are not included in the API. + # This applies to stuff like __dispatch__is_complex(), and add_outf(). + # These aren't "real aten ops", they're just additional functions provided by the C++ API. + # They're implemented as wrappers in Functions.h that call into the actual operators + # defined here, i.e. at::_ops::is_complex::call() and at::_ops::add_out::call(). + # This means that ATEN_OP(is_complex) will not fastpath, and will go through the dispatcher. 
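+ # Illustrative example: for aten::add.Tensor the unambiguous name is add_Tensor, so this
+ # emits `struct TORCH_API add_Tensor` with name "aten::add" and overload_name "Tensor".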
+ return f""" +struct TORCH_API {name} {{ + using schema = {sig.type()}; + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::{f.func.name.name}") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "{f.func.name.overload_name}") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, {cpp_string(str(f.func))}) + static {sig.defn(name="call", is_redispatching_fn=False)}; + static {sig.defn(name="redispatch", is_redispatching_fn=True)}; +}};""" + + elif self.target is Target.DEFINITION: + defns = f""" +STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, name, "aten::{f.func.name.name}") +STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, overload_name, "{f.func.name.overload_name}") +STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, schema_str, {cpp_string(str(f.func))}) + +// aten::{f.func} +static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed_handle() {{ + return c10::Dispatcher::singleton() + .findSchemaOrThrow({name}::name, {name}::overload_name) + .typed<{name}::schema>(); +}} +""" + for is_redispatching_fn in [False, True]: + if is_redispatching_fn: + dispatcher_exprs_str = ", ".join( + ["dispatchKeySet"] + [a.name for a in sig.arguments()] + ) + method_base = "redispatch" + else: + dispatcher_exprs_str = ", ".join([a.name for a in sig.arguments()]) + method_base = "call" + + dispatcher_call = method_base + method_name = f"{name}::{method_base}" + + fn_body = f""" + static auto op = create_{name}_typed_handle(); + return op.{dispatcher_call}({dispatcher_exprs_str});""" + + if ( + not is_redispatching_fn + and len(self.static_dispatch_backend_indices) > 0 + ): + # call() should go through static dispatch + fn_body = static_dispatch( + sig, f, backend_indices=self.static_dispatch_backend_indices + ) + defns += f""" +// aten::{f.func} +{sig.defn(name=method_name, is_redispatching_fn=is_redispatching_fn)} {{ + {fn_body} +}} +""" + return defns + else: + assert_never(self.target) + + +# Generates Functions.h, which provides the functional public C++ API, +# and the scaffolding to call into the dispatcher from these functions. +@dataclass(frozen=True) +class ComputeFunction: + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + sig_group = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=f.manual_cpp_binding + ) + has_symint = f.func.has_symint() + + result = "" + for sig in sig_group.signatures(): + # See Note [The ATen Operators API] + target_sig = DispatcherSignature.from_schema(f.func) + exprs = translate(sig.arguments(), target_sig.arguments()) + exprs_str = ", ".join([e.expr for e in exprs]) + + if sig.symint: + intlike_t = "c10::SymInt" + else: + intlike_t = "int64_t" + + if Variant.function in f.variants: + result += f""" +// aten::{f.func} +inline {sig.decl()} {{ + return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); +}}""" + + # The template function can be used from template situations + # where you want to switch between the symint or not version + # depending on a template argument + # + # NB: we ALWAYS generate this even for methods. But we put it in + # this header so it can take advantage of per-op headers + if has_symint: + result += f""" +namespace symint {{ + template ::value>> + {sig.decl(suppress_symint_suffix=True)} {{ + return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); + }} +}} +""" + return result + + +# Generates TensorBody.h. 
This file provides the object-oriented (method-based) +# public C++ API, and the scaffolding to call into the dispatcher from these functions. +@dataclass(frozen=True) +class ComputeTensorMethod: + target: Literal[Target.DECLARATION, Target.DEFINITION] + static_dispatch_backend_indices: list[BackendIndex] + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + if Variant.method not in f.variants: + return None + + assert not f.func.is_out_fn() + assert f.func.arguments.self_arg is not None + + sig_group = CppSignatureGroup.from_native_function( + f, method=True, fallback_binding=f.manual_cpp_binding + ) + + if self.target is Target.DECLARATION: + result = "" + for sig in sig_group.signatures(): + result += f"{sig.decl()} const;\n" + return result + + if self.target is not Target.DEFINITION: + assert_never(self.target) + + result = "" + + for sig in sig_group.signatures(): + target_sig = DispatcherSignature.from_schema(f.func) + exprs = translate(sig.arguments(), target_sig.arguments(), method=True) + exprs_str = ", ".join([e.expr for e in exprs]) + + result += f""" +// aten::{f.func} +inline {sig.defn(prefix="Tensor::")} const {{ + return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); +}} +""" + + return result + + +# Generates RedispatchFunctions.h. +# This is similar to the C++ API defined in Functions.h, but provides access +# to the dispatcher's redispatch API. +@dataclass(frozen=True) +class ComputeRedispatchFunction: + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + # We unconditionally generate function variants of the redispatch API. + # This is mainly because we can namespace functions separately, but not methods, + sig_group = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=f.manual_cpp_binding + ) + + result = "" + for sig in sig_group.signatures(): + target_sig = DispatcherSignature.from_schema(f.func) + exprs = translate(sig.arguments(), target_sig.arguments()) + exprs_str = ", ".join(["dispatchKeySet"] + [a.expr for a in exprs]) + + result += f""" +// aten::{f.func} +inline {sig.decl(is_redispatching_fn=True)} {{ + return at::_ops::{f.func.name.unambiguous_name()}::redispatch({exprs_str}); +}} +""" + + return result + + +# Generates ATenOpList.cpp, a runtime accessible list of all aten +# operators. +# TODO: This was historically used to help some JIT interop code +# figure out whether or not to treat aten namespace'd operators +# one way or another, we should reevaluate if this is actually needed. +@with_native_function +def compute_aten_op(f: NativeFunction) -> str: + return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},' + + +# Generates MetaFunctions.h +def compute_meta_function_declaration(g: NativeFunctionsGroup) -> str | None: + if not g.structured: + return None + with native_function_manager(g.out): + name = meta.name(g) + args = structured.meta_arguments(g) + args_str = ", ".join(a.decl() for a in args) + parent_class = g.out.structured_inherits + if parent_class is None: + parent_class = "at::impl::MetaBase" + meta_return = "void" + precomputed = g.out.precomputed if g.structured else None + + if precomputed: + # Generate the template declaration with one bool parameter for each + # precomputed element. Each parameter is true if the corresponding (in + # terms of position) precomputed element has been set. 
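+ # Illustrative example: a meta function that precomputes, say, kH and kW would get a
+ # `precompute_out<bool KH, bool KW>` struct with set_kH/set_kW setters generated below.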
+ precomputed_values = [*precomputed.replace.values(), precomputed.add] + precomputed_elements = [ + elem for replace_list in precomputed_values for elem in replace_list + ] + precomputed_template_parameters = [ + elem.name.upper() for elem in precomputed_elements + ] + precomputed_template_params_str = ", ".join( + f"bool {param} = false" for param in precomputed_template_parameters + ) + precompute_template_decl = f"template <{precomputed_template_params_str}>" + + # Generate a string containing declarations of all precomputed elements. + precomputed_elements_with_cpp_types = [ + structured.argument_type(elem, binds=elem.name) + for elem in precomputed_elements + ] + + precomputed_elements_decl = ";\n".join( + f"{elem.cpp_type(strip_ref=True)} {elem.name}" + for elem in precomputed_elements_with_cpp_types + ) + + # Generate "setter" methods for each precomputed element. Each method will return + # a new instance of precompute_out with the template parameter that corresponds to + # the member set by the method to true (to indicate that it has been set). + setter_methods = [] + for i, elem in enumerate(precomputed_elements): + # Generate the signature. The return type will be the same + # as the type of `this` but with the template parameter + # corresponding to the element set by this method set to true. + # The assert generated below will ensure that this template + # parameter is false on the type of `this`. + return_ty_templates = ", ".join( + precomputed_template_parameters[:i] + + ["true"] + + precomputed_template_parameters[i + 1 :] + ) + return_ty = f"precompute_out<{return_ty_templates}>" + elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type( + strip_ref=True + ) + signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)" + + # Generate an assert which checks that the + # template parameter corresponding to the precomputed + # element that is set by this method is false on the + # class corresponding to the object that `this` points to. + # This ensures that each element can be set only once. + assert_msg = f'"{elem.name} already set"' + assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});" + + # Generate the new object construction block. All state + # except the element that this method sets is copied from the + # object that `this` points to. The value for the element that + # the method sets is taken from a method parameter. + construction_stmts = [] + construction_stmts.append(f"{return_ty} ret;") + + for j, elem in enumerate(precomputed_elements): + if i == j: + construction_stmts.append(f"ret.{elem.name} = value;") + else: + construction_stmts.append( + f"ret.{elem.name} = this->{elem.name};" + ) + + construction_stmts.append("return ret;") + construction_block = "\n".join(construction_stmts) + + setter_methods.append( + f""" + {signature} {{ + {assert_stmt} + {construction_block} + }} + """ + ) + setter_methods_decl = "\n".join(setter_methods) + + # Meta should return an instance of the struct containing the precomputed elements. + meta_return_template_params = ", ".join( + ["true"] * len(precomputed_template_parameters) + ) + # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return + # type (which has a variable number of template parameters). 
+ meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;" + meta_return = "meta_return_ty" + precomputed_decl = f""" + {precompute_template_decl} + struct TORCH_API precompute_out {{ + {setter_methods_decl} + {precomputed_elements_decl}; + }};""" + else: + meta_return_typedef = "" + precomputed_decl = "" + + return f"""\ +struct TORCH_API structured_{name} : public {parent_class} {{ + {precomputed_decl} + {meta_return_typedef} + {meta_return} meta({args_str}); +}}; +""" + + +def needs_backend_select(f: NativeFunction, selector: SelectiveBuilder) -> bool: + name = str(f.func.name.name) + if name.endswith("_like") or name.startswith("new_"): + return False + if f.func.arguments.tensor_options is None: + return False + return selector.is_native_function_selected(f) + + +# Generates RegisterBackendSelect.cpp, a series of kernels which provide +# specialized computation of dispatch key for operator signatures which cannot +# be easily done automatically using templating. +@dataclass(frozen=True) +class ComputeBackendSelect: + target: Literal[Target.DEFINITION, Target.REGISTRATION] + + # Selector object to determine which operators to generate + # registration code for. + selector: SelectiveBuilder + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + if not needs_backend_select(f, self.selector): + return None + + name = native.name(f.func) + # BackendSelect can go to Meta, so it must preserve symints + native_sig = NativeSignature(f.func, symint=True) + + native_tensor_args = [ + a + for a in native_sig.arguments() + if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like() + ] + + dispatcher_sig = DispatcherSignature.from_schema(f.func) + + sig: NativeSignature | DispatcherSignature + sig = dispatcher_sig + dispatcher_exprs = dispatcher_sig.exprs() + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" + + if self.target is Target.DEFINITION: + # I don't think there's actually a good reason to generate + # these two cases differently + # The first case could probably be improved though- it calls computeDispatchKeySet(), + # which looks at TLS dispatch keys- there should not be any by the time we reach backend select. 
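+ # Illustrative example: a factory op like aten::empty has TensorOptions but no tensor
+ # arguments, so its key comes purely from (dtype, layout, device); ops that also take
+ # tensors additionally fold those tensors' dispatch keys into the set.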
+ if native_tensor_args: + assert f.func.arguments.has_tensor_arg() + tensor_args = ", ".join(a.name for a in native_tensor_args) + compute_dk = f"""\ +DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args}); +DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); +DispatchKeySet _dk = c10::impl::computeDispatchKeySet(_dk_set, _dk_mask);""" + else: + assert not f.func.arguments.has_tensor_arg() + compute_dk = ( + f"DispatchKeySet _dk = c10::DispatchKeySet({dispatch_key});" + ) + return f"""\ +// aten::{f.func} +C10_ALWAYS_INLINE +{sig.defn(name)} {{ + {compute_dk} + return at::_ops::{f.func.name.unambiguous_name()}::redispatch( + _dk, {', '.join(a.expr for a in dispatcher_exprs)}); +}} +""" + elif self.target is Target.REGISTRATION: + return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" + else: + assert_never(self.target) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# YAML CODE GENERATION +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def format_yaml(data: object) -> str: + # Ignore alias in Dumper + YamlDumper.ignore_aliases = lambda self, data: True # type: ignore[assignment] + + # Support serializing OrderedDict + def dict_representer(dumper: Any, data: Any) -> Any: + return dumper.represent_dict(data.items()) + + YamlDumper.add_representer(OrderedDict, dict_representer) # type: ignore[no-untyped-call] + # Some yaml parsers (e.g. Haskell's) don't understand line breaks. + # width=1e9 turns off optional line breaks and improves + # the portability of the outputted yaml. + return yaml.dump(data, default_flow_style=False, Dumper=YamlDumper, width=1e9) # type: ignore[no-any-return, call-overload] + + +# For some reason, some defaults we write to YAML are written as native +# YAML objects, rather than doing them uniformly as strings. This +# function detects those cases and converts them into native Python +# objects. +def pythonify_default(s: str) -> object: + if s == "true": + return True + elif s == "false": + return False + + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + return s + + +# What is a dynamic type? Over time, the semantic meaning of +# dynamic type has degraded to meaninglessness (in the old days, +# it captured dtype-ness of types, but that has gone away with +# the removal of TH). These days, it's mostly the same thing as +# the C++ API argument type, except that Tensor and Tensor? +# arguments simply present as Tensor. 
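+# Illustrative example: both `Tensor self` and `Tensor? weight` report a dynamic_type of
+# "at::Tensor", while other argument types fall through to their C++ argument type.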
+# +# TODO: Get rid of dynamic_type, after getting tools/autograd +# to use the new codegen framework +def dynamic_type(t: Type) -> str: + if isinstance(t, OptionalType): + return dynamic_type(t.elem) + # Note we don't use t.is_tensor_like() here because it would + # also include Tensor[] + if str(t) == "Tensor": + return "at::Tensor" + # This is a legacy concept, so never report SymInt + return cpp.argumenttype_type( + t, mutable=False, binds="__placeholder__", symint=False + ).cpp_type() + + +def compute_method_of_yaml(variants: set[Variant]) -> list[str]: + # This is written out explicitly to ensure that Tensor and + # namespace are put into the list in the right order + method_of = ["Type"] + if Variant.method in variants: + method_of.append("Tensor") + if Variant.function in variants: + method_of.append("namespace") + return method_of + + +def compute_returns_yaml( + f: NativeFunction, +) -> tuple[list[dict[str, str]], dict[str, str]]: + # Note [name and field_name] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~ + # To understand name_to_field_name, we must first talk about this + # schema: + # + # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) + # + # There is something very odd about this schema: it is an out + # variant of the function (that is to say, it will convert into + # at::lstsq_out() in the C++ API), but the names of the output + # return arguments don't match the keyword argument names of + # the inputs. It TURNS OUT that in this situation, the historical + # Declarations.yaml we want to output is this (abbreviated to + # only show relevant fields): + # + # arguments: + # ... + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # ... + # + # returns: + # - field_name: solution + # name: X + # - field_name: QR + # name: qr + # + # The name of the return fields is stored in 'field_name', and the + # name of the arguments is stored in 'name'. So when we process + # arguments, we need a way to get at the corresponding return. At + # the moment, this is most conveniently done by constructing a + # mapping from name (the argument concept) to field_name (the + # return concept) while processing return arguments, since we don't + # directly maintain this correspondence in the modeling of function + # schema itself. 
+ # + # See also https://github.com/pytorch/pytorch/issues/43114 + name_to_field_name: dict[str, str] = {} + + # Compute the returns field of the YAML entry + names = cpp.return_names(f) + returns = [] + for i, (r, name) in enumerate(zip(f.func.returns, names)): + ret = { + "dynamic_type": dynamic_type(r.type), + "name": name, + # legacy, report ints + "type": cpp.return_type(r, symint=False).cpp_type(), + } + + if r.name: + # See Note [name and field_name] + ret["field_name"] = r.name + if f.func.is_out_fn(): + name_to_field_name[f.func.arguments.out[i].name] = r.name + + returns.append(ret) + + return returns, name_to_field_name + + +# arguments in yaml roughly corresponds to the public C++ API +def compute_cpp_argument_yaml( + cpp_a: Binding, + *, + schema_order: bool, + kwarg_only_set: set[str], + out_arg_set: set[str], + name_to_field_name: dict[str, str], +) -> object: + if isinstance(cpp_a.argument, TensorOptionsArguments): + arg: dict[str, object] = { + "annotation": None, + "dynamic_type": "at::TensorOptions", + "is_nullable": False, + "name": cpp_a.name, + "type": cpp_a.type, + "kwarg_only": True, + } + if cpp_a.default is not None: + arg["default"] = cpp_a.default + return arg + elif isinstance(cpp_a.argument, SelfArgument): + raise AssertionError + elif isinstance(cpp_a.argument, Argument): + return compute_argument_yaml( + cpp_a.argument, + schema_order=schema_order, + kwarg_only_set=kwarg_only_set, + out_arg_set=out_arg_set, + name_to_field_name=name_to_field_name, + ) + + +def compute_argument_yaml( + a: Argument, + *, + schema_order: bool, + kwarg_only_set: set[str], + out_arg_set: set[str], + name_to_field_name: dict[str, str], +) -> object: + arg: dict[str, object] = { + "annotation": str(a.annotation) if a.annotation else None, + "dynamic_type": dynamic_type(a.type), + "is_nullable": a.type.is_nullable(), + "name": a.name, + # legacy, report ints + "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(), + } + if a.default is not None: + arg["default"] = pythonify_default( + cpp.default_expr(a.default, a.type, symint=False) + ) + if a.name in kwarg_only_set: + arg["kwarg_only"] = True + if a.name in out_arg_set: + arg["output"] = True + arg["allocate"] = True + # See Note [name and field_name] + if a.name in name_to_field_name: + arg["field_name"] = name_to_field_name[a.name] + # Historically, booleans don't get their size recorded, because it + # is already built into the cpp type (e.g., std::array) + l = a.type.is_list_like() + if l is not None and l.size is not None and str(l.elem) != "bool": + arg["size"] = l.size + return arg + + +@with_native_function +def compute_declaration_yaml(f: NativeFunction) -> object: + returns, name_to_field_name = compute_returns_yaml(f) + + # These sets are used to conveniently test if an argument is a + # kwarg-only or out argument + kwarg_only_set = {a.name for a in f.func.arguments.flat_kwarg_only} + out_arg_set = {a.name for a in f.func.arguments.out} + + sig_group = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=False + ) + cpp_args = sig_group.signature.arguments() + arguments = [ + compute_cpp_argument_yaml( + cpp_a, + schema_order=False, + kwarg_only_set=kwarg_only_set, + out_arg_set=out_arg_set, + name_to_field_name=name_to_field_name, + ) + for cpp_a in cpp_args + ] + + schema_order_jit_arguments = list(f.func.schema_order_arguments()) + + schema_order_arguments = [ + compute_argument_yaml( + a, + schema_order=True, + kwarg_only_set=kwarg_only_set, + out_arg_set=out_arg_set, 
+ name_to_field_name=name_to_field_name, + ) + for a in schema_order_jit_arguments + ] + + cpp_schema_order_types = [ + # NB: method here doesn't matter + r.type + for a in schema_order_jit_arguments + for r in cpp.argument( + a, + method=False, + cpp_no_default_args=set(), + faithful=False, + symint=False, + has_tensor_options=False, + ) + ] + + # legacy, report ints + cpp_returns = cpp.returns_type(f.func.returns, symint=False).cpp_type() + schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" + + is_factory_method = ( + any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) + and Variant.method not in f.variants + ) + + return OrderedDict( + [ + ("name", cpp.name(f.func)), + ("operator_name", str(f.func.name.name)), + ("overload_name", str(f.func.name.overload_name)), + ("manual_kernel_registration", f.manual_kernel_registration), + ( + "category_override", + f.category_override if f.category_override is not None else "", + ), + ("schema_string", f"aten::{f.func}"), + ("arguments", arguments), + ("schema_order_cpp_signature", schema_order_cpp_signature), + ("schema_order_arguments", schema_order_arguments), + ("method_of", compute_method_of_yaml(f.variants)), + ("mode", "native"), + ("python_module", "" if f.python_module is None else f.python_module), + ("returns", returns), + ("inplace", f.func.name.name.inplace), + ("is_factory_method", is_factory_method), + ("abstract", f.is_abstract), + ("device_guard", f.device_guard), + ("with_gil", False), + ("deprecated", False), + ("has_math_kernel", f.has_composite_implicit_autograd_kernel), + ] + ) + + +# See Note [Auto generated composite kernels] +def has_autogenerated_composite_kernel(f: NativeFunction) -> bool: + return (f.structured or f.structured_delegate is not None) and ( + f.func.kind() == SchemaKind.functional or f.func.kind() == SchemaKind.inplace + ) + + +@with_native_function_and_indices +def compute_registration_declarations( + f: NativeFunction, backend_indices: dict[DispatchKey, BackendIndex] +) -> str: + name = dispatcher.name(f.func) + returns_type = dispatcher.returns_type( + f.func.returns + ).cpp_type_registration_declarations() + args = dispatcher.arguments(f.func) + args_str = ", ".join(a.no_default().decl_registration_declarations() for a in args) + comment_data: dict[str, str] = { + "schema": f"aten::{f.func}", + # TODO: What exactly is the semantics of the 'dispatch' field? + "dispatch": str( + {k for k, v in backend_indices.items() if v.has_kernel(f)} + != {DispatchKey.CompositeImplicitAutograd} + and {k for k, v in backend_indices.items() if v.has_kernel(f)} + != { + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + } + ), + "default": str(f.has_composite_kernel or has_autogenerated_composite_kernel(f)), + } + return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)} +""" + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# RUN IT ALL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def get_custom_build_selector( + provided_op_registration_allowlist: list[str] | None, + op_selection_yaml_path: str | None, +) -> SelectiveBuilder: + assert not ( + provided_op_registration_allowlist is not None + and op_selection_yaml_path is not None + ), ( + "Both provided_op_registration_allowlist and " + + "op_selection_yaml_path can NOT be provided at the " + + "same time." 
+ ) + + op_registration_allowlist: set[str] | None = None + if provided_op_registration_allowlist is not None: + op_registration_allowlist = set(provided_op_registration_allowlist) + + if op_registration_allowlist is not None: + selector = SelectiveBuilder.from_legacy_op_registration_allow_list( + op_registration_allowlist, + True, + False, + ) + elif op_selection_yaml_path is not None: + selector = SelectiveBuilder.from_yaml_path(op_selection_yaml_path) + else: + selector = SelectiveBuilder.get_nop_selector() + + return selector + + +def get_grouped_by_view_native_functions( + native_functions: Sequence[NativeFunction], +) -> Sequence[NativeFunction | NativeFunctionsViewGroup]: + def maybe_create_view_group( + d: dict[ViewSchemaKind | SchemaKind, NativeFunction] + ) -> list[NativeFunction | NativeFunctionsViewGroup]: + funcs: list[NativeFunction | NativeFunctionsViewGroup] = [] + if ViewSchemaKind.aliasing in d: + view = d.pop(ViewSchemaKind.aliasing) + view_inplace = d.pop(ViewSchemaKind.aliasing_inplace, None) + view_copy = d.pop(SchemaKind.functional, None) + + funcs.append( + NativeFunctionsViewGroup( + view=view, + view_copy=view_copy, + view_inplace=view_inplace, + ) + ) + # Take the remaining functions that weren't part of the view group + # and emit them separately + funcs.extend(d.values()) + return funcs + + grouped_by_views: dict[ + FunctionSchema, dict[SchemaKind | ViewSchemaKind, NativeFunction] + ] = defaultdict(dict) + for f in native_functions: + schema = f.func.view_signature() + view_kind: ViewSchemaKind = f.view_schema_kind + # We need to group up ops relevant to the same "view", consisting of: + # view op (ViewSchemaKind.aliasing) + # view_inplace op (ViewSchemaKind.aliasing_inplace) + # view_copy op (SchemaKind.functional) + if view_kind == ViewSchemaKind.non_aliasing: + kind = f.func.kind() + assert kind not in grouped_by_views[schema] + grouped_by_views[schema][kind] = f + else: + assert ( + view_kind not in grouped_by_views[schema] + ), f"{view_kind} already in {grouped_by_views[schema].keys()}" + grouped_by_views[schema][view_kind] = f + + return list(concatMap(maybe_create_view_group, grouped_by_views.values())) + + +def get_grouped_native_functions( + native_functions: Sequence[NativeFunction], +) -> Sequence[NativeFunction | NativeFunctionsGroup]: + def flatten_pre_group( + d: dict[SchemaKind, NativeFunction] + ) -> Sequence[NativeFunction | NativeFunctionsGroup]: + r = NativeFunctionsGroup.from_dict(d) + if r is None: + # Invariant: any NativeFunctions that are code-generated + # should have been grouped into NativeFunctionsGroup objects + assert not any("generated" in f.tags for f in d.values()) + return list(d.values()) + else: + return [r] + + # TODO: how come ValuesView isn't a Sequence lol + pre_grouped_native_functions = pre_group_native_functions(native_functions) + return list( + concatMap(flatten_pre_group, list(pre_grouped_native_functions.values())) + ) + + +def get_ns_grouped_kernels( + *, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + backend_indices: dict[DispatchKey, BackendIndex], + native_function_decl_gen: Callable[ + [NativeFunctionsGroup | NativeFunction, BackendIndex], list[str] + ] = dest.compute_native_function_declaration, +) -> dict[str, list[str]]: + ns_grouped_kernels: dict[str, list[str]] = defaultdict(list) + for f in grouped_native_functions: + native_function_namespaces = set() + dispatch_keys = set() + for dispatch_key, backend_idx in backend_indices.items(): + backend_metadata = 
backend_idx.get_kernel(f) + if backend_metadata: + namespace = backend_metadata.cpp_namespace + dispatch_keys.add(dispatch_key) + native_function_namespaces.add(namespace) + else: + namespace = DEFAULT_KERNEL_NAMESPACE + assert ( + len(native_function_namespaces) <= 1 + ), f"Codegen only supports one namespace per operator, got {native_function_namespaces} from {dispatch_keys}" + ns_grouped_kernels[namespace].extend( + native_function_decl_gen(f, backend_idx) + ) + return ns_grouped_kernels + + +def get_native_function_declarations_from_ns_grouped_kernels( + *, + ns_grouped_kernels: dict[str, list[str]], +) -> list[str]: + declarations: list[str] = [] + newline = "\n" + for namespace, kernels in ns_grouped_kernels.items(): + ns_helper = NamespaceHelper( + namespace_str=namespace, + entity_name="", + max_level=4, + ) + # Convert to a set first to remove duplicate kernel names. Backends are + # allowed to repeat kernel names; only generate the declaration once! + ordered_kernels = list(OrderedDict.fromkeys(kernels)) + declarations.extend( + f""" +{ns_helper.prologue} +{newline.join(ordered_kernels)} +{ns_helper.epilogue} + """.split( + newline + ) + ) + return declarations + + +# Return native function declarations grouped by their namespaces. +def get_native_function_declarations( + *, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + backend_indices: dict[DispatchKey, BackendIndex], + native_function_decl_gen: Callable[ + [NativeFunctionsGroup | NativeFunction, BackendIndex], list[str] + ] = dest.compute_native_function_declaration, +) -> list[str]: + """ + Generate kernel declarations, in `NativeFunction(s).h`. + :param grouped_native_functions: a sequence of `NativeFunction` or `NativeFunctionGroup`. + :param backend_indices: kernel collections grouped by dispatch key. + :param native_function_decl_gen: callable to generate kernel declaration for each `NativeFunction`. + :return: a list of string, from the string with all declarations, grouped by namespaces, split by newline. + """ + + ns_grouped_kernels = get_ns_grouped_kernels( + grouped_native_functions=grouped_native_functions, + backend_indices=backend_indices, + native_function_decl_gen=native_function_decl_gen, + ) + return get_native_function_declarations_from_ns_grouped_kernels( + ns_grouped_kernels=ns_grouped_kernels + ) + + +def get_kernel_namespace( + *, f: NativeFunction | NativeFunctionsGroup, backend_idx: BackendIndex +) -> str: + backend_metadata = backend_idx.get_kernel(f) + assert not backend_metadata or "::native" in backend_metadata.cpp_namespace, ( + f"The kernel for function {f.func.name if isinstance(f, NativeFunction) else f.functional.func.name} " + f"with dispatch key {backend_idx.dispatch_key}" + f" has a namespace {backend_metadata.cpp_namespace} and it's not ending with '::native'." + ) + return ( + backend_metadata.cpp_namespace if backend_metadata else DEFAULT_KERNEL_NAMESPACE + ) + + +# Return native function definitions grouped by dispatch key and custom namespace. +# Used in RegisterDispatchKey.cpp and etc. 
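+# Illustrative example: for DispatchKey.CPU this produces the kernel wrapper definitions
+# plus the TORCH_LIBRARY_IMPL(aten, CPU, m) registration block emitted into RegisterCPU.cpp.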
+def get_native_function_definitions( + *, + fm: FileManager, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_idx: BackendIndex, + selector: SelectiveBuilder, + rocm: bool, + symint: bool, + skip_dispatcher_op_registration: bool, + gen_dispatch_helpers: bool, +) -> list[str]: + definitions: list[str] = [] + ns_definitions: dict[str, list[str]] = defaultdict(list) + anonymous_definitions: dict[str, list[str]] = defaultdict(list) + registrations: dict[str, dict[str, list[str]]] = defaultdict(dict) + newline = "\n" + ns_gen = dest.RegisterDispatchKey( + backend_idx, + Target.NAMESPACED_DEFINITION, + selector, + rocm=rocm, + symint=symint, + class_method_name=None, + skip_dispatcher_op_registration=skip_dispatcher_op_registration, + ) + anonymous_gen = dest.RegisterDispatchKey( + backend_idx, + Target.ANONYMOUS_DEFINITION, + selector, + rocm=rocm, + symint=symint, + class_method_name=None, + skip_dispatcher_op_registration=skip_dispatcher_op_registration, + ) + reg_gen = dest.RegisterDispatchKey( + backend_idx, + Target.REGISTRATION, + selector, + rocm=rocm, + symint=symint, + class_method_name=None, + skip_dispatcher_op_registration=skip_dispatcher_op_registration, + ) + for f in grouped_native_functions: + kernel_namespace = get_kernel_namespace(f=f, backend_idx=backend_idx).replace( + "::native", "" + ) + + ns_definitions[kernel_namespace].extend( + ns_gen(f), + ) + anonymous_definitions[kernel_namespace].extend( + anonymous_gen(f), + ) + namespace = ( + f.namespace if isinstance(f, NativeFunction) else f.functional.namespace + ) + if namespace not in registrations[kernel_namespace]: + registrations[kernel_namespace] = defaultdict(list) + registrations[kernel_namespace][namespace].extend( + reg_gen(f), + ) + + for kernel_namespace in ns_definitions: + if len(ns_definitions[kernel_namespace]) == 0: + continue + ns_helper = NamespaceHelper(namespace_str=kernel_namespace) + registration_body = "" + for namespace in registrations[kernel_namespace]: + if not registrations[kernel_namespace][namespace]: + continue + registration_body += f""" +TORCH_LIBRARY_IMPL({namespace}, {dispatch_key}, m) {{ + {newline.join(registrations[kernel_namespace][namespace])} +}};""" + definitions.extend( + fm.substitute_with_template( + "RegisterDispatchDefinitions.ini", + lambda: { + "ns_prologue": ns_helper.prologue, + "ns_epilogue": ns_helper.epilogue, + "dispatch_helpers": dest.gen_registration_helpers(backend_idx) + if gen_dispatch_helpers + else [], + "dispatch_anonymous_definitions": anonymous_definitions[ + kernel_namespace + ], + "static_init_dispatch_registrations": "" + if skip_dispatcher_op_registration + else registration_body, + "deferred_dispatch_registrations": "", + "dispatch_namespace": dispatch_key.lower(), + "dispatch_namespaced_definitions": ns_definitions[kernel_namespace], + }, + ).split(newline) + ) + + return definitions + + +# Return native function declarations grouped by dispatch key and custom namespace. +# Used in CPUFunctions_inl.h and etc. 
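+# Illustrative example: for DispatchKey.CPU the declarations are wrapped in the at::cpu
+# namespace, which is what CPUFunctions_inl.h ultimately exposes.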
+def get_namespaced_declaration( + *, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_idx: BackendIndex, + selector: SelectiveBuilder, + rocm: bool, + symint: bool, +) -> list[str]: + declarations: list[str] = [] + ns_grouped_kernels: dict[str, list[str]] = defaultdict(list) + newline = "\n" + func = dest.RegisterDispatchKey( + backend_idx, + Target.NAMESPACED_DECLARATION, + selector, + rocm=rocm, + class_method_name=None, + skip_dispatcher_op_registration=False, + symint=symint, + ) + for f in grouped_native_functions: + namespace = get_kernel_namespace(f=f, backend_idx=backend_idx).replace( + "native", dispatch_key.lower() + ) + + ns_grouped_kernels[namespace].extend( + func(f), + ) + + for namespace, kernels in ns_grouped_kernels.items(): + if len(kernels) == 0: + continue + ns_helper = NamespaceHelper( + namespace_str=namespace, entity_name="", max_level=3 + ) + ordered_kernels = list(OrderedDict.fromkeys(kernels)) + declarations.extend( + f""" +{ns_helper.prologue} +{newline.join(ordered_kernels)} +{ns_helper.epilogue} + """.split( + newline + ) + ) + return declarations + + +# Return native function schema registration code for aten and other namespaces. +def get_native_function_schema_registrations( + *, + native_functions: Sequence[NativeFunction], + schema_selector: SelectiveBuilder, +) -> tuple[list[str], str]: + ns_native_functions: dict[str, list[NativeFunction]] = defaultdict(list) + for native_function in native_functions: + ns_native_functions[native_function.namespace].append(native_function) + schema_registrations = "" + aten_schema_registrations = [] + custom_namespace = None + for namespace, funcs in ns_native_functions.items(): + schema_registrations_body = list( + mapMaybe(RegisterSchema(schema_selector), funcs) + ) + # NB: we have to separate aten namespace registration from other namespaces, + # because in the template we hardcoded an operator for ATen already. 
+ if namespace == "aten": + aten_schema_registrations = schema_registrations_body + else: + custom_namespace = namespace + tab = "\t" + # if the namespace is predefined, we should use define a library fragment + # instead of a new library + torch_library_macro = ( + "TORCH_LIBRARY_FRAGMENT" + if namespace in FRAGMENT_NAMESPACES + else "TORCH_LIBRARY" + ) + schema_registrations += f""" +{torch_library_macro}({custom_namespace}, m) {{ + {tab.join(schema_registrations_body)} +}};""" + return (aten_schema_registrations, schema_registrations) + + +def gen_aggregated_headers( + *, + native_functions: Sequence[NativeFunction], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + structured_native_functions: Sequence[NativeFunctionsGroup], + static_dispatch_idx: list[BackendIndex], + selector: SelectiveBuilder, + backend_indices: dict[DispatchKey, BackendIndex], + cpu_fm: FileManager, + cuda_fm: FileManager, + functions_keys: set[DispatchKey], + dispatch_keys: Sequence[DispatchKey], + rocm: bool, +) -> None: + # Buck doesn't support dynamic output files, so we aggregate all operator + # headers into a single file + cpu_fm.write( + "NativeMetaFunctions.h", + lambda: { + "NativeMetaFunctions_includes": [], + "NativeMetaFunctions_declarations": list( + mapMaybe(compute_meta_function_declaration, structured_native_functions) + ), + }, + ) + method_native_functions = [ + fn for fn in native_functions if Variant.method in fn.variants + ] + non_method_native_functions = [ + fn for fn in native_functions if fn not in method_native_functions + ] + cpu_fm.write( + "MethodOperators.h", + lambda: { + "MethodOperators_includes": [], + "MethodOperators_declarations": list( + mapMaybe( + ComputeOperators( + Target.DECLARATION, + static_dispatch_backend_indices=static_dispatch_idx, + ), + method_native_functions, + ) + ), + }, + ) + cpu_fm.write( + "Operators.h", + lambda: { + "Operators_includes": ["#include "], + "Operators_declarations": list( + mapMaybe( + ComputeOperators( + Target.DECLARATION, + static_dispatch_backend_indices=static_dispatch_idx, + ), + non_method_native_functions, + ) + ), + }, + ) + cpu_fm.write( + "Functions.h", + lambda: { + "static_dispatch_extra_headers": static_dispatch_extra_headers( + static_dispatch_idx + ), + "Functions_includes": ["#include "], + "Functions_declarations": list( + mapMaybe( + ComputeFunction(), + native_functions, + ) + ), + }, + ) + declarations = get_native_function_declarations( + grouped_native_functions=grouped_native_functions, + backend_indices=backend_indices, + ) + cpu_fm.write( + "NativeFunctions.h", + lambda: { + "NativeFunctions_includes": ["#include "], + "NativeFunctions_declarations": declarations, + }, + ) + + for dispatch_key in dispatch_keys: + fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + if dispatch_key in functions_keys: + inl_headers = f"#include " + + fm.write_with_template( + f"{dispatch_key}Functions.h", + "DispatchKeyFunctions.h", + lambda: { + "dispatch_key": str(dispatch_key), + "inline_headers": inl_headers, + }, + ) + fm.write_with_template( + f"{dispatch_key}Functions_inl.h", + "DispatchKeyFunctions_inl.h", + lambda: { + "DispatchKeyFunctions_inl_includes": [], + "dispatch_namespace": dispatch_key.lower(), + "dispatch_namespaced_declarations": get_namespaced_declaration( + grouped_native_functions=grouped_native_functions, + dispatch_key=dispatch_key, + backend_idx=backend_indices[dispatch_key], + selector=selector, + rocm=rocm, + symint=True, + ), + }, + ) + + del fm + + +def 
gen_per_operator_headers( + *, + native_functions: Sequence[NativeFunction], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + static_dispatch_idx: list[BackendIndex], + selector: SelectiveBuilder, + backend_indices: dict[DispatchKey, BackendIndex], + cpu_fm: FileManager, + cuda_fm: FileManager, + ops_fm: FileManager, + functions_keys: set[DispatchKey], + dispatch_keys: Sequence[DispatchKey], + rocm: bool, +) -> None: + # For CMake builds, split operator declarations into separate headers in + # the ATen/ops folder to split up header dependencies + functions_by_root_name: dict[str, list[NativeFunction]] = defaultdict(list) + for fn in native_functions: + functions_by_root_name[fn.root_name].append(fn) + + grouped_functions_by_root_name: dict[ + str, list[NativeFunction | NativeFunctionsGroup] + ] = defaultdict(list) + for group in grouped_native_functions: + name = group.root_name + grouped_functions_by_root_name[name].append(group) + + for name, functions in functions_by_root_name.items(): + ops_fm.write_with_template( + f"{name}_ops.h", + "Operator.h", + lambda: { + "declarations": list( + mapMaybe( + ComputeOperators( + Target.DECLARATION, + static_dispatch_backend_indices=static_dispatch_idx, + ), + functions, + ) + ), + }, + ) + + ops_fm.write_with_template( + f"{name}.h", + "Function.h", + lambda: { + "static_dispatch_ops_headers": list( + mapMaybe( + lambda fn: static_dispatch_ops_header( + fn, backend_index=static_dispatch_idx + ), + functions, + ) + ), + "operator_includes": f"#include ", + "function_definitions": list( + mapMaybe( + ComputeFunction(), + functions, + ) + ), + }, + ) + + grouped_functions = grouped_functions_by_root_name.get(name, []) + structured_functions = [ + fn + for fn in grouped_functions + if isinstance(fn, NativeFunctionsGroup) and fn.structured + ] + is_structured = len(structured_functions) > 0 + + if is_structured: + ops_fm.write_with_template( + f"{name}_meta.h", + "NativeMetaFunction.h", + lambda: { + "meta_function_declarations": list( + mapMaybe( + compute_meta_function_declaration, structured_functions + ) + ), + }, + ) + declarations = get_native_function_declarations( + grouped_native_functions=grouped_functions, + backend_indices=backend_indices, + native_function_decl_gen=dest.compute_native_function_declaration, + ) + ops_fm.write_with_template( + f"{name}_native.h", + "NativeFunction.h", + lambda: { + "extra_includes": ( + f"#include " if is_structured else [] + ), + "native_function_declarations": declarations, + }, + ) + + for category, suffix in [ + ("Functions", ""), + ("Operators", "_ops"), + ("NativeMetaFunctions", "_meta"), + ("NativeFunctions", "_native"), + ]: + cpu_fm.write( + f"{category}.h", + lambda: { + f"{category}_includes": [ + f"#include " + for name in sorted(functions_by_root_name.keys()) + ], + f"{category}_declarations": [], + }, + ) + + for dispatch_key in dispatch_keys: + if dispatch_key not in functions_keys: + continue + + dispatch_namespace = dispatch_key.lower() + dispatch_names = [] + + for name, functions in functions_by_root_name.items(): + grouped_functions = grouped_functions_by_root_name.get(name, []) + declarations = list( + concatMap( + dest.RegisterDispatchKey( + backend_indices[dispatch_key], + Target.NAMESPACED_DECLARATION, + selector, + rocm=rocm, + symint=True, + class_method_name=None, + skip_dispatcher_op_registration=False, + ), + grouped_functions, + ) + ) + + if len(declarations) == 0: + continue + + dispatch_names.append(name) + ops_fm.write_with_template( + 
f"{name}_{dispatch_namespace}_dispatch.h", + "DispatchKeyFunction.h", + lambda: { + "dispatch_namespace": dispatch_namespace, + "dispatch_namespaced_declarations": declarations, + }, + ) + + fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + inl_headers = f"#include " + + fm.write_with_template( + f"{dispatch_key}Functions.h", + "DispatchKeyFunctions.h", + lambda: { + "dispatch_key": str(dispatch_key), + "inline_headers": inl_headers, + }, + ) + fm.write_with_template( + f"{dispatch_key}Functions_inl.h", + "DispatchKeyFunctions_inl.h", + lambda: { + "dispatch_namespace": dispatch_namespace, + "DispatchKeyFunctions_inl_includes": [ + f"#include " + for name in sorted(dispatch_names) + ], + "dispatch_namespaced_declarations": [], + }, + ) + del fm + + cpu_fm.write( + "MethodOperators.h", + lambda: { + "MethodOperators_includes": sorted( + f"#include " + for name, functions in functions_by_root_name.items() + if any(Variant.method in fn.variants for fn in functions) + ), + "MethodOperators_declarations": [], + }, + ) + + +def gen_headers( + *, + native_functions: Sequence[NativeFunction], + valid_tags: set[str], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + structured_native_functions: Sequence[NativeFunctionsGroup], + static_dispatch_idx: list[BackendIndex], + selector: SelectiveBuilder, + backend_indices: dict[DispatchKey, BackendIndex], + core_fm: FileManager, + cpu_fm: FileManager, + cuda_fm: FileManager, + ops_fm: FileManager, + dispatch_keys: Sequence[DispatchKey], + functions_keys: set[DispatchKey], + rocm: bool, + per_operator_headers: bool, +) -> None: + if per_operator_headers: + gen_per_operator_headers( + native_functions=native_functions, + grouped_native_functions=grouped_native_functions, + static_dispatch_idx=static_dispatch_idx, + selector=selector, + backend_indices=backend_indices, + cpu_fm=cpu_fm, + cuda_fm=cuda_fm, + ops_fm=ops_fm, + dispatch_keys=dispatch_keys, + functions_keys=functions_keys, + rocm=rocm, + ) + else: + gen_aggregated_headers( + native_functions=native_functions, + grouped_native_functions=grouped_native_functions, + structured_native_functions=structured_native_functions, + static_dispatch_idx=static_dispatch_idx, + selector=selector, + backend_indices=backend_indices, + cpu_fm=cpu_fm, + cuda_fm=cuda_fm, + dispatch_keys=dispatch_keys, + functions_keys=functions_keys, + rocm=rocm, + ) + + core_fm.write( + "TensorBody.h", + lambda: { + "tensor_method_declarations": list( + mapMaybe( + ComputeTensorMethod( + target=Target.DECLARATION, + static_dispatch_backend_indices=static_dispatch_idx, + ), + native_functions, + ) + ), + "tensor_method_definitions": list( + mapMaybe( + ComputeTensorMethod( + target=Target.DEFINITION, + static_dispatch_backend_indices=static_dispatch_idx, + ), + native_functions, + ) + ), + }, + ) + + cpu_fm.write( + "RedispatchFunctions.h", + lambda: { + "function_redispatch_definitions": list( + mapMaybe(ComputeRedispatchFunction(), native_functions) + ), + }, + ) + + cpu_fm.write( + "RegistrationDeclarations.h", + lambda: { + "registration_declarations": [ + compute_registration_declarations(f, backend_indices) + for f in native_functions + ], + }, + ) + + cpu_fm.write( + "VmapGeneratedPlumbing.h", lambda: gen_all_vmap_plumbing(native_functions) + ) + + def gen_aten_interned_strings() -> dict[str, str]: + attrs: set[str] = set() # All function argument names + names = set() # All ATen function names + for func in native_functions: + names.add(str(func.func.name.name)) + # Some operators 
don't have a functional variant but we still create a + # symbol without the underscore + names.add(func.func.name.name.base) + + attrs.update(arg.name for arg in func.func.schema_order_arguments()) + + # These are keywords in C++, so aren't valid symbol names + # https://en.cppreference.com/w/cpp/language/operator_alternative + names -= { + "and", + "and_eq", + "bitand", + "bitor", + "compl", + "not", + "not_eq", + "or", + "or_eq", + "xor", + "xor_eq", + } + + return { + "aten_symbols": " \\\n".join( + [f"_(aten, {name})" for name in sorted(names)] + ), + "attr_symbols": " \\\n".join( + [f"_(attr, {name})" for name in sorted(attrs)] + ), + } + + core_fm.write("aten_interned_strings.h", gen_aten_interned_strings) + + def gen_tags_enum() -> dict[str, str]: + return {"enum_of_valid_tags": (",\n".join(sorted(valid_tags)))} + + core_fm.write("enum_tag.h", gen_tags_enum) + + +def gen_source_files( + *, + native_functions: Sequence[NativeFunction], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + structured_native_functions: Sequence[NativeFunctionsGroup], + view_groups: Sequence[NativeFunctionsViewGroup], + selector: SelectiveBuilder, + static_dispatch_idx: list[BackendIndex], + backend_indices: dict[DispatchKey, BackendIndex], + aoti_fm: FileManager, + core_fm: FileManager, + cpu_fm: FileManager, + cpu_vec_fm: FileManager, + cuda_fm: FileManager, + dispatch_keys: Sequence[DispatchKey], + functions_keys: set[DispatchKey], + rocm: bool, + force_schema_registration: bool, + per_operator_headers: bool, + skip_dispatcher_op_registration: bool, + update_aoti_c_shim: bool, +) -> None: + extra_cuda_headers = """\ +#include +#include +#include +#include """ + if rocm: + extra_cuda_headers = """\ +#include +#include +#include +#include """ + + for dispatch_key in dispatch_keys: + fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + + if per_operator_headers: + + def operator_headers() -> list[str]: + headers = [] + for g in grouped_native_functions: + is_registered = False + if backend_index.has_kernel(g): + is_registered = True + # The above has_kernel test on a group will only test for + # the existence of out dispatch, because that's how + # structured kernels work. But sometimes functions can be + # grouped but not be structured, and then you need to check + # each individual piece, as they may have manual dispatch + # entries. + elif isinstance(g, NativeFunctionsGroup) and any( + backend_index.has_kernel(fn) for fn in g.functions() + ): + is_registered = True + # TODO: this condition is a bit questionable + # (It has to do with the fact that structured kernels get generated kernels + # to the Meta + CompositeExplicitAutogradNonFunctional keys). 
+ elif g.structured and dispatch_key in ( + DispatchKey.Meta, + DispatchKey.CompositeExplicitAutogradNonFunctional, + ): + is_registered = True + if not is_registered: + continue + + headers.append(f"#include ") + if ( + dispatch_key + == DispatchKey.CompositeExplicitAutogradNonFunctional + ): + headers.append(f"#include ") + if dispatch_key in functions_keys: + headers.append( + f"#include " + ) + + return sorted(set(headers)) + + else: + + def operator_headers() -> list[str]: + headers = ["#include "] + if dispatch_key == DispatchKey.CompositeExplicitAutogradNonFunctional: + headers.append("#include ") + if dispatch_key in functions_keys: + headers.append(f"#include ") + return headers + + backend_index = backend_indices[dispatch_key] + ns_grouped_native_functions = defaultdict(list) + for grouped_native_function in grouped_native_functions: + namespace = ( + grouped_native_function.namespace + if isinstance(grouped_native_function, NativeFunction) + else grouped_native_function.functional.namespace + ) + ns_grouped_native_functions[namespace].append(grouped_native_function) + + dispatch_namespace = str(dispatch_key).lower() + + # CompositeImplicitAutogradNestdTensor does not currently user the helpers generated + # compilation will fail when `-Werror=unused-function` flag is set + gen_dispatch_helpers: bool = ( + dispatch_key != DispatchKey.CompositeImplicitAutogradNestedTensor + ) + + dispatch_definitions = get_native_function_definitions( + fm=fm, + grouped_native_functions=grouped_native_functions, + dispatch_key=dispatch_key, + backend_idx=backend_index, + selector=selector, + rocm=rocm, + symint=True, + skip_dispatcher_op_registration=skip_dispatcher_op_registration, + gen_dispatch_helpers=gen_dispatch_helpers, + ) + fm.write_with_template( + f"Register{dispatch_key}.cpp", + "RegisterDispatchKey.cpp", + lambda: { + "extra_cuda_headers": extra_cuda_headers + if is_cuda_dispatch_key(dispatch_key) + else "", + "external_backend_headers": "", + "dispatch_headers": dest.gen_registration_headers( + backend_index, per_operator_headers, rocm + ), + "ops_headers": operator_headers(), + "dispatch_helpers": "", + "dispatch_definitions": dispatch_definitions, + }, + ) + + for g in structured_native_functions: + if not g.out.ufunc_inner_loop or not is_ufunc_dispatch_key(dispatch_key): + continue + name = g.functional.func.name.name + if dispatch_key is DispatchKey.CPU: + assert fm is cpu_fm + fm.write_with_template( + f"UfuncCPU_{name}.cpp", + "UfuncCPU.cpp", + lambda: { + "meta_declaration": compute_meta_function_declaration(g), + "native_declaration": dest.compute_native_function_declaration( + g, backend_indices[dispatch_key] + ), + "native_definitions": dest.compute_ufunc_cpu(g), + }, + ) + cpu_vec_fm.write_with_template( + f"UfuncCPUKernel_{name}.cpp", + "UfuncCPUKernel.cpp", + lambda: { + "name": name, + "native_definitions": dest.compute_ufunc_cpu_kernel(g), + }, + ) + elif dispatch_key is DispatchKey.CUDA: + cuda_headers = "#include " + if rocm: + cuda_headers = "#include " + fm.write_with_template( + f"UfuncCUDA_{name}.cu", + "UfuncCUDA.cu", + lambda: { + "name": name, + "cuda_headers": cuda_headers, + "meta_declaration": compute_meta_function_declaration(g), + "native_declaration": dest.compute_native_function_declaration( + g, backend_indices[dispatch_key] + ), + "native_definitions": dest.compute_ufunc_cuda(g), + }, + ) + else: + raise AssertionError(f"unrecognized {dispatch_key} for ufunc") + + structured_func_group_dict = {} + for func_group in structured_native_functions: + for 
func in func_group.functions(): + if func.structured_delegate is not None: + structured_func_group_dict[func.structured_delegate] = func_group + break + + if dispatch_key in (DispatchKey.CPU, DispatchKey.CUDA): + fallbacks = {} + for func in native_functions: + op_name = get_fallback_op_name(func) + if op_name in inductor_fallback_ops: + fallbacks[op_name] = func + fallback_native_functions = tuple( + value for _, value in sorted(fallbacks.items()) + ) + + # header files were checked in for ABI-compatiblilty checking + header_file_name = f"c_shim_{dispatch_key.lower()}.h" + new_header = gen_aoti_c_shim( + fallback_native_functions, + structured_func_group_dict, + dispatch_key, + backend_indices, + header=True, + includes="", + ) + if update_aoti_c_shim: + aoti_fm.write( + header_file_name, + lambda: new_header, + ) + else: + try: + with open( + os.path.join(aoti_fm.install_dir, header_file_name) + ) as old_file: + old_header = old_file.read() + assert ( + old_header == new_header + ), """ + +WARNING: The generated AOTInductor C shim header files have unexpectedly changed. This +indicates an AOTInductor fallback operator ABI backward compatibility breakage!!! +Only in a limited number of situations, this is allowed: + +1. You added a fallback op to the inductor_fallback_ops list in torchgen/aoti/fallback_ops.py. +If that's the case, run `python torchgen/gen.py --update-aoti-c-shim` to update the existing +C shim header files. + +2. You added a new default argument to an existing fallback op. This is clearly a BC breaking +change in the AOTInductor land. In this case, you need to keep a manual copy of that existing +fallback op in a file, e.g. torch/csrc/inductor/aoti_torch/c/shim.h, bump up the version +number of that fallback op in the newly generated C shim files, and update the cpp wrapper +codegen to generate the correct cpp call for this op. Contact AOTInductor team for assistance. 
+ + """ + except FileNotFoundError: + print( + f"{os.path.join(aoti_fm.install_dir, header_file_name)} not found" + ) + + # cpp files are always generated on-the-fly + def headers_for_aoti() -> str: + headers = [] + for func in fallback_native_functions: + header = get_header_for_aoti( + func, structured_func_group_dict, dispatch_key, backend_indices + ) + if header is not None: + headers.append(header) + return "\n".join(sorted(set(headers))) + + extra_headers = ( + extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else "" + ) + + aoti_fm.write( + f"c_shim_{dispatch_key.lower()}.cpp", + lambda: gen_aoti_c_shim( + fallback_native_functions, + structured_func_group_dict, + dispatch_key, + backend_indices, + header=False, + includes=headers_for_aoti() + "\n" + extra_headers, + ), + ) + + del fm + + # BackendSelect is generated specially + def gen_backend_select() -> dict[str, list[str]]: + relevant_fns = [ + fn for fn in native_functions if needs_backend_select(fn, selector) + ] + return { + "ops_headers": [ + f"#include " for fn in relevant_fns + ], + "backend_select_method_definitions": list( + mapMaybe( + ComputeBackendSelect(Target.DEFINITION, selector), relevant_fns + ) + ), + "backend_select_function_registrations": list( + mapMaybe( + ComputeBackendSelect(Target.REGISTRATION, selector), relevant_fns + ) + ), + } + + cpu_fm.write("RegisterBackendSelect.cpp", gen_backend_select) + + schema_selector = selector + if force_schema_registration: + schema_selector = SelectiveBuilder.get_nop_selector() + + ( + aten_schema_registrations, + schema_registrations, + ) = get_native_function_schema_registrations( + native_functions=native_functions, schema_selector=schema_selector + ) + cpu_fm.write( + "RegisterSchema.cpp", + lambda: { + "aten_schema_registrations": [] + if skip_dispatcher_op_registration + else aten_schema_registrations, + "schema_registrations": [] + if skip_dispatcher_op_registration + else schema_registrations, + }, + ) + + def key_func( + fn: NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup, + ) -> str: + return fn.root_name + + cpu_fm.write_sharded( + "Operators.cpp", + native_functions, + key_fn=key_func, + env_callable=lambda fn: { + "operator_headers": [f"#include "], + "definitions": [ + ComputeOperators( + Target.DEFINITION, + static_dispatch_backend_indices=static_dispatch_idx, + )(fn) + ], + }, + base_env={ + "static_dispatch_extra_headers": static_dispatch_extra_headers( + static_dispatch_idx + ), + }, + num_shards=5, + sharded_keys={ + "operator_headers", + "definitions", + "static_dispatch_extra_headers", + }, + ) + + cpu_fm.write("Functions.cpp", dict) + + core_fm.write("TensorMethods.cpp", dict) + + core_fm.write( + "ATenOpList.cpp", + lambda: { + "aten_ops": list(mapMaybe(compute_aten_op, native_functions)), + }, + ) + + def functionalization_env_callable( + g: NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup, + ) -> dict[str, list[str]]: + def gen_op_headers( + g: NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup, + ) -> list[str]: + if isinstance(g, NativeFunctionsViewGroup): + # view ops always get a functionalization kernel + headers = [ + f"#include ", + f"#include ", + ] + if g.view_copy is not None: + headers += [ + f"#include ", + f"#include ", + ] + return headers + elif isinstance(g, NativeFunctionsGroup): + headers = [ + f"#include ", + f"#include ", + f"#include ", + f"#include ", + ] + if g.inplace is not None: + headers += [ + f"#include ", + f"#include ", + ] + if g.mutable is not None: + headers 
+= [ + f"#include ", + f"#include ", + ] + return headers + else: + return [ + f"#include ", + f"#include ", + ] + + return { + "ops_headers": gen_op_headers(g), + "func_definitions": gen_functionalization_definition( + selector, + g, + ), + "func_registrations": gen_functionalization_registration( + selector, + g, + backend_indices[DispatchKey.CompositeImplicitAutograd], + ), + } + + all_groups: list[ + NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup + ] = list(structured_native_functions) + list( + view_groups # type: ignore[assignment, arg-type, operator] + ) + # Note: all operators that functionalization needs to handle (mutable and aliasing ops) should be grouped properly. + # The only reason we really need to deal with direct NativeFunctions here (instead of the groups) is because: + # (1) We can provide better error checking (error out if someone introduces a mutable op that doesn't obey the grouping logic) + # (2) functionalization needs to manually register CompositeImplicitAutograd kernels, which might not be grouped. + # Although this could go away long-term if we add a dedicated dispatch key for decompositions. + structured_map: dict[OperatorName, NativeFunction] = { + f.func.name: f + for f in concatMap(lambda g: list(g.functions()), structured_native_functions) + } + view_map: dict[OperatorName, NativeFunction] = { + f.func.name: f for f in concatMap(lambda g: list(g.functions()), view_groups) + } + for f in native_functions: + if f.func.name not in structured_map and f.func.name not in view_map: + all_groups.append(f) + + cpu_fm.write_sharded( + "RegisterFunctionalization.cpp", + all_groups, + key_fn=key_func, + env_callable=functionalization_env_callable, + num_shards=4, + sharded_keys={ + "ops_headers", + "func_definitions", + "func_registrations", + "func_add_back_views_definitions", + "func_add_back_views_registrations", + }, + ) + + cpu_fm.write( + "FunctionalInverses.h", + lambda: { + "view_inverse_declarations": list( + mapMaybe( + lambda g: gen_functionalization_view_inverse_declaration( + selector, g + ), + view_groups, + ) + ) + }, + ) + + # Note [view_copy NativeFunctions] + # Every view operator in native_functions.yaml that is not CompositeImplicitAutograd + # needs to have a corresponding non-aliasing {view}_copy variant. + # Backends that use functionalization and don't know how to handle aliasing ops + # are expected to implement kernels for these {view}_copy kernels instead. + # The code for {view}_copy operators in core is pretty boilerplate-heavy however, + # so we codegen the following: + # (1) A CompositeExplicitAutogradNonFunctional kernel for every {view}_copy operator. + # These are never explicitly invoked by the functionalization pass, + # but they could theoretically be called from user code (I added these kernels for completeness, + # since the ops are part of the public API). + # (2) A derivative formula for every {view}_copy operator + # {view}_copy operators can re-use the same derivative formulas as their {view} op counterparts, + # so rather than stamping all of the entries out in derivatives.yaml, + # we codegen them in. + # This is similar to how autograd codegen doesn't require inplace ops to have a derivatives.yaml entry. 
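
The {view} / {view}_copy pairing described in the note above can be stated as a tiny, self-contained sketch (an illustration only, not part of this patch or of torchgen's API; the operator names are just examples). Backends that rely on functionalization implement kernels for the non-aliasing *_copy variant instead of the aliasing view op.

# Illustration: derive the "<view>_copy" name for a view operator, preserving any
# overload suffix in the operator name.
def view_copy_name(view_op: str) -> str:
    base, _, overload = view_op.partition(".")
    return f"{base}_copy.{overload}" if overload else f"{base}_copy"

assert view_copy_name("expand") == "expand_copy"
assert view_copy_name("slice.Tensor") == "slice_copy.Tensor"
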
+ cpu_fm.write( + "CompositeViewCopyKernels.cpp", + lambda: { + "ops_headers": [ + "\n".join( + f"#include \n" + # NB: this include is important as it ensures we + # set the visibility on generated view_copy kernels + # correctly + f"#include " + for f in ( + [g.view] if g.view_copy is None else [g.view, g.view_copy] + ) + ) + for g in view_groups + ] + + [ + "\n".join( + f"#include \n" + # NB: this include is also important for correct visibility + f"#include " + for f in [g.inplace, g.mutable, g.functional] + if f is not None and "generated" not in f.tags + ) + for g in structured_native_functions + ], + "CompositeViewCopyKernel_Definitions": list( + mapMaybe( + GenCompositeViewCopyKernel( + backend_indices[ + DispatchKey.CompositeExplicitAutogradNonFunctional + ] + ), + view_groups, + ) + ), + "GeneratedCompositeFunctional_Definitions": list( + mapMaybe( + gen_composite_functional_kernel, + structured_native_functions, + ) + ), + "GeneratedCompositeOut_Definitions": list( + mapMaybe( + gen_composite_out_kernel, + structured_native_functions, + ) + ), + }, + ) + + +def gen_declarations_yaml( + cpu_fm: FileManager, native_functions: Sequence[NativeFunction] +) -> None: + cpu_fm.write( + "Declarations.yaml", + lambda: format_yaml([compute_declaration_yaml(f) for f in native_functions]), + ) + + +def get_torchgen_root() -> Path: + """ + If you're depending on torchgen out-of-tree, you can use the root to figure + out the path to native_functions.yaml + """ + return Path(__file__).parent.resolve() + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate ATen source files") + parser.add_argument( + "-s", + "--source-path", + help="path to source directory for ATen", + default="aten/src/ATen", + ) + parser.add_argument( + "-o", + "--output-dependencies", + help="output a list of dependencies into the given file and exit", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="run without writing any files (still updates outputs)", + ) + parser.add_argument( + "--per-operator-headers", + action="store_true", + help="generate separate headers per operator in ATen/ops", + ) + parser.add_argument( + "-d", + "--install-dir", + "--install_dir", + help="output directory", + default="build/aten/src/ATen", + ) + parser.add_argument( + "--aoti-install-dir", + "--aoti_install_dir", + help="output directory for AOTInductor shim", + default="torch/csrc/inductor/aoti_torch/generated", + ) + parser.add_argument( + "--rocm", + action="store_true", + help="reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly", + ) + parser.add_argument( + "--mps", + action="store_true", + help="Generate MPS registration code when set", + ) + # TODO: --op-registration-whitelist will be removed when all call-sites + # for gen.py are moved over to using the operator YAML file for mobile + # custom build. + parser.add_argument( + "--op-registration-whitelist", + "--op_registration_whitelist", + nargs="*", + help="filter op registrations by the whitelist (if set); " + "each item is `namespace`::`operator name` without overload name; " + "e.g.: aten::empty aten::conv2d ...", + ) + parser.add_argument( + "--op-selection-yaml-path", + "--op_selection_yaml_path", + help="Provide a path to the operator selection (for custom build) YAML " + "that contains the information about the set of selected operators " + "and their categories (training, ...). Each operator is either a " + "full operator name with overload or just a bare operator name. 
" + "The operator names also contain the namespace prefix (e.g. aten::)", + ) + parser.add_argument( + "--backend-whitelist", + "--backend_whitelist", + nargs="*", + help="filter dispatch backend by the whitelist (if set), " + "e.g.: CPU CUDA QuantizedCPU ...", + ) + parser.add_argument( + "--static-dispatch-backend", + "--static_dispatch_backend", + nargs="*", + help="generate static dispatch code for the specific backend (if set)", + ) + parser.add_argument( + "--skip-dispatcher-op-registration", + "--skip_dispatcher_op_registration", + action="store_true", + help="Avoid registering operators into the dispatcher.", + ) + parser.add_argument( + "--force-schema-registration", + "--force_schema_registration", + action="store_true", + help="force it to generate schema-only registrations for all ops, including" + "those that are not listed on --op-registration-whitelist", + ) + parser.add_argument( + "--generate", + type=str, + nargs="*", + choices=["headers", "sources", "declarations_yaml"], + default=["headers", "sources", "declarations_yaml"], + help="Generate only a subset of files", + ) + parser.add_argument( + "--update-aoti-c-shim", + action="store_true", + help="Update AOTInductor C shim after adding an entry to inductor_fallback_ops in torchgen/aoti/fallback_ops.py. " + "WARNING: Do not use this unless you are sure what you are doing!!!", + ) + + options = parser.parse_args() + + selector = get_custom_build_selector( + options.op_registration_whitelist, + options.op_selection_yaml_path, + ) + + native_yaml_path = os.path.join(options.source_path, "native/native_functions.yaml") + tags_yaml_path = os.path.join(options.source_path, "native/tags.yaml") + + from torchgen.model import dispatch_keys + + # TODO: stop generating CUDA kernels for non-CUDA builds + ignore_keys = set() + if not options.mps: + ignore_keys.add(DispatchKey.MPS) + + if DispatchKey.MPS in dispatch_keys: + del dispatch_keys[dispatch_keys.index(DispatchKey.MPS)] + + parsed_yaml = parse_native_yaml(native_yaml_path, tags_yaml_path, ignore_keys) + valid_tags = _GLOBAL_PARSE_TAGS_YAML_CACHE[tags_yaml_path] + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + + grouped_native_functions = get_grouped_native_functions(native_functions) + + structured_native_functions = [ + g for g in grouped_native_functions if isinstance(g, NativeFunctionsGroup) + ] + native_functions_with_view_groups = get_grouped_by_view_native_functions( + native_functions + ) + view_groups = [ + g + for g in native_functions_with_view_groups + if isinstance(g, NativeFunctionsViewGroup) + ] + + # NB: It is mandatory to NOT use os.path.join here, as the install directory + # will eventually be ingested by cmake, which does not respect Windows style + # path slashes. If you switch this to use os.path.join, you'll get an error + # like: + # + # Syntax error in cmake code when parsing string + # + # C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h + # + # Invalid character escape '\c'. 
+ core_install_dir = f"{options.install_dir}/core" + Path(core_install_dir).mkdir(parents=True, exist_ok=True) + ops_install_dir = f"{options.install_dir}/ops" + Path(ops_install_dir).mkdir(parents=True, exist_ok=True) + aoti_install_dir = f"{options.aoti_install_dir}" + Path(aoti_install_dir).mkdir(parents=True, exist_ok=True) + + core_fm = make_file_manager(options=options, install_dir=core_install_dir) + cpu_fm = make_file_manager(options=options) + cpu_vec_fm = make_file_manager(options=options) + cuda_fm = make_file_manager(options=options) + ops_fm = make_file_manager(options=options, install_dir=ops_install_dir) + aoti_fm = make_file_manager(options=options, install_dir=aoti_install_dir) + + # Only a limited set of dispatch keys get CPUFunctions.h headers generated + # for them; this is the set + functions_keys = { + DispatchKey.CPU, + DispatchKey.CUDA, + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + DispatchKey.CompositeExplicitAutograd, + DispatchKey.CompositeExplicitAutogradNonFunctional, + DispatchKey.Meta, + } + if options.mps: + functions_keys.add(DispatchKey.MPS) + + if options.backend_whitelist: + dispatch_keys = [ + k + for k in dispatch_keys + if is_generic_dispatch_key(k) or str(k) in options.backend_whitelist + ] + + static_dispatch_idx: list[BackendIndex] = [] + if options.static_dispatch_backend: + static_dispatch_idx = [ + backend_indices[DispatchKey.parse(key)] + for key in options.static_dispatch_backend + ] + for key in options.static_dispatch_backend: + dp_key = DispatchKey.parse(key) + if dp_key not in functions_keys: + functions_keys.add(dp_key) + + if "sources" in options.generate: + gen_source_files( + native_functions=native_functions, + grouped_native_functions=grouped_native_functions, + structured_native_functions=structured_native_functions, + view_groups=view_groups, + selector=selector, + static_dispatch_idx=static_dispatch_idx, + backend_indices=backend_indices, + aoti_fm=aoti_fm, + core_fm=core_fm, + cpu_fm=cpu_fm, + cpu_vec_fm=cpu_vec_fm, + cuda_fm=cuda_fm, + dispatch_keys=dispatch_keys, + functions_keys=functions_keys, + rocm=options.rocm, + force_schema_registration=options.force_schema_registration, + per_operator_headers=options.per_operator_headers, + skip_dispatcher_op_registration=options.skip_dispatcher_op_registration, + update_aoti_c_shim=options.update_aoti_c_shim, + ) + + if "headers" in options.generate: + gen_headers( + native_functions=native_functions, + valid_tags=valid_tags, + grouped_native_functions=grouped_native_functions, + structured_native_functions=structured_native_functions, + static_dispatch_idx=static_dispatch_idx, + selector=selector, + backend_indices=backend_indices, + core_fm=core_fm, + cpu_fm=cpu_fm, + cuda_fm=cuda_fm, + ops_fm=ops_fm, + dispatch_keys=dispatch_keys, + functions_keys=functions_keys, + rocm=options.rocm, + per_operator_headers=options.per_operator_headers, + ) + + if "declarations_yaml" in options.generate: + gen_declarations_yaml(native_functions=native_functions, cpu_fm=cpu_fm) + + if options.output_dependencies: + depfile_path = Path(options.output_dependencies).resolve() + depfile_name = depfile_path.name + depfile_stem = depfile_path.stem + + for fm, prefix in [ + (cpu_fm, ""), + (cpu_vec_fm, "cpu_vec_"), + (core_fm, "core_"), + (cuda_fm, "cuda_"), + (ops_fm, "ops_"), + ]: + varname = prefix + depfile_stem + path = depfile_path.parent / (prefix + depfile_name) + fm.write_outputs(varname, str(path)) + + +if __name__ == "__main__": + main() diff --git 
a/torchgen/gen_aoti_c_shim.py b/torchgen/gen_aoti_c_shim.py new file mode 100644 index 00000000000..5ba12f88bdd --- /dev/null +++ b/torchgen/gen_aoti_c_shim.py @@ -0,0 +1,486 @@ +from __future__ import annotations + +import textwrap +from dataclasses import dataclass +from typing import Sequence + +from torchgen.api.types import DispatcherSignature +from torchgen.api.types.signatures import CppSignature, CppSignatureGroup +from torchgen.context import method_with_native_function +from torchgen.model import ( + Argument, + BackendIndex, + BaseTy, + BaseType, + DispatchKey, + FunctionSchema, + ListType, + NativeFunction, + NativeFunctionsGroup, + OperatorName, + OptionalType, + Type, +) +from torchgen.utils import mapMaybe + + +base_type_to_c_type = { + BaseTy.Tensor: "AtenTensorHandle", + BaseTy.bool: "int32_t", # Use int to pass bool + BaseTy.int: "int64_t", + BaseTy.SymInt: "int64_t", # Inductor-generated code won't see a SymInt + BaseTy.Scalar: "double", # Use double to pass both integer and floating point + BaseTy.float: "double", # TODO: how about other floating point types? + BaseTy.str: "const char*", + BaseTy.DeviceIndex: "int32_t", + BaseTy.Layout: "int32_t", # Represent enum as int + BaseTy.MemoryFormat: "int32_t", # Represent enum as int + BaseTy.ScalarType: "int32_t", # Represent enum as int + BaseTy.Generator: "AtenGeneratorHandle", +} + +base_type_to_aten_type = { + BaseTy.Tensor: "at::Tensor", + BaseTy.bool: "bool", + BaseTy.int: "int64_t", + BaseTy.SymInt: "c10::SymInt", + BaseTy.Scalar: "c10::Scalar", + BaseTy.float: "double", + BaseTy.str: "c10::string_view", + BaseTy.DeviceIndex: "c10::DeviceIndex", + BaseTy.Layout: "c10::Layout", + BaseTy.MemoryFormat: "c10::MemoryFormat", + BaseTy.ScalarType: "c10::ScalarType", + BaseTy.Generator: "at::Generator", +} + +base_type_to_callsite_expr = { + BaseTy.Tensor: "*tensor_handle_to_tensor_pointer", + BaseTy.bool: "", + BaseTy.int: "", + BaseTy.SymInt: "", + BaseTy.Scalar: "", + BaseTy.float: "", + BaseTy.str: "", + BaseTy.DeviceIndex: "static_cast", + BaseTy.Layout: "static_cast", + BaseTy.MemoryFormat: "static_cast", + BaseTy.ScalarType: "static_cast", + BaseTy.Generator: "*generator_handle_to_generator_pointer", +} + + +# convert args to C types, names in declarations, and expressions in function bodies +def convert_arg_type_and_name(typ: Type, name: str) -> tuple[list[str], list[str], list[str], list[str]]: # type: ignore[return] + if isinstance(typ, BaseType): + if typ.name in base_type_to_c_type: + return ( + [base_type_to_c_type[typ.name]], + [name], + [base_type_to_aten_type[typ.name]], + [ + f"{base_type_to_callsite_expr[typ.name]}({name})" + if base_type_to_callsite_expr[typ.name] + else name + ], + ) + elif typ.name == BaseTy.Device: + return ( + ["int32_t", "int32_t"], + [name, name + "_index_"], + ["c10::Device"], + [ + f"c10::Device(static_cast({name}), static_cast({name}_index_))" + ], + ) + else: + # TODO: BaseTy.Dimname, etc. 
+ raise NotImplementedError(f"TODO: add support for arg type {repr(typ)}") + elif isinstance(typ, OptionalType): + c_types, names, aten_types, callsite_exprs = convert_arg_type_and_name( + typ.elem, name + ) + j = 0 # index for names + new_aten_types = [] + new_callsite_exprs = [] + for aten_type in aten_types: + # Use pointer to denote optional type + c_types[j] = c_types[j] + "*" + if aten_type.startswith("c10::ArrayRef<"): + # ArrayRef is passed as pointer + size, but no need to add "*" to the size argument + new_aten_types.append(f"::std::optional<{aten_type}>") + base_type = aten_type[len("c10::ArrayRef<") : -1] + new_callsite_exprs.append( + f"pointer_to_optional_list<{base_type}>({names[j]}, {names[j+1]})" + ) + j += 2 + elif aten_type == "c10::Device": + # Device is passed as device_type + device_index + new_aten_types.append("::std::optional") + new_callsite_exprs.append( + f"pointer_to_optional_device({names[j]}, {names[j+1]})" + ) + j += 2 + else: + new_aten_types.append(f"::std::optional<{aten_type}>") + new_callsite_exprs.append( + f"pointer_to_optional<{aten_type}>({names[j]})" + ) + j += 1 + + return ( + c_types, + names, + new_aten_types, + new_callsite_exprs, + ) + elif isinstance(typ, ListType): + # Need to explictly pass the list as pointer + length + c_types, names, aten_types, _ = convert_arg_type_and_name(typ.elem, name) + assert len(c_types) == 1, "ListType with unsupported element type " + repr(typ) + + # The list content should never be modified + c_types[0] = f"const {c_types[0]}*" + c_types.append("int64_t") + name = names[0] + names.append(name + "_len_") + + atype = aten_types[0] + callsite_exprs = [] + if atype == "bool": + # no converter from std::vector to c10::ArrayRef + # construct std::array instead + assert typ.size is not None + callsite_exprs.append(f"pointer_to_list<{typ.size}>({name})") + elif atype == "::std::optional": + # convert from std::vector<::std::optional> to c10::List<::std::optional> + callsite_exprs.append( + f"c10::List<{atype}>(c10::ArrayRef<{atype}>(pointer_to_list<{atype}>({name}, {name}_len_)))" + ) + else: + callsite_exprs.append(f"pointer_to_list<{atype}>({name}, {name}_len_)") + + aten_types = [f"c10::ArrayRef<{t}>" for t in aten_types] + return ( + c_types, + names, + aten_types, + callsite_exprs, + ) + + +def zip_type_and_name(types: list[str], names: list[str]) -> list[str]: + return [typ + " " + name for typ, name in zip(types, names)] + + +# Generate argument declarations and callsite expressions +def gen_arguments(flat_arguments: Sequence[Argument]) -> tuple[list[str], list[str]]: + types = [] + new_names = [] + callsite_exprs = [] + for arg in flat_arguments: + new_types, names, _, new_callsite_exprs = convert_arg_type_and_name( + arg.type, arg.name + ) + types.extend(new_types) + new_names.extend(names) + callsite_exprs.extend(new_callsite_exprs) + return zip_type_and_name(types, new_names), callsite_exprs + + +# Return values are passed out as pointer arguments because all the C shim functions +# are expected to return AOTITorchError. 
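
As a concrete illustration of the calling convention described above (a sketch for a hypothetical operator, not part of this patch): each return value is surfaced through a pointer out-parameter, because the generated C entry point itself returns an AOTITorchError status.

# Mirrors the idea behind gen_returns using a local copy of two entries from
# base_type_to_c_type; "my_op" below is hypothetical.
example_c_type = {"Tensor": "AtenTensorHandle", "int": "int64_t"}

def sketch_return_params(return_types):
    # ["Tensor", "int"] -> ["AtenTensorHandle* ret0", "int64_t* ret1"]
    return [f"{example_c_type[t]}* ret{i}" for i, t in enumerate(return_types)]

# A shim for a hypothetical CPU op returning (Tensor, int) is declared roughly as:
#   AOTITorchError aoti_torch_cpu_my_op(<input args...>, AtenTensorHandle* ret0, int64_t* ret1);
print(sketch_return_params(["Tensor", "int"]))
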
+# Generate returns as declarations and callsite expressions +def gen_returns(schema: FunctionSchema) -> tuple[list[str], list[str]]: + types = [] + names = [] + for idx, ret in enumerate(schema.returns): + names.append(f"ret{idx}") + if isinstance(ret.type, BaseType) and ret.type.name in base_type_to_c_type: + types.append(base_type_to_c_type[ret.type.name] + "*") + else: + raise NotImplementedError( + f"TODO: add support for return type {repr(ret.type)}" + ) + + def convert_return(typ: BaseType, val: str) -> str: + if typ.name == BaseTy.Tensor: + return f"new_tensor_handle(std::move({val}));" + elif typ.name == BaseTy.SymInt: + return f"{val}.expect_int()" + elif typ.name == BaseTy.Scalar: + return f"{val}.toDouble()" + else: + return val + + ret_pointer_can_be_null = False + unambiguous_name = schema.name.unambiguous_name() + for name in [ + "_scaled_dot_product_flash_attention", + "_scaled_dot_product_efficient_attention", + "_scaled_dot_product_cudnn_attention", + "convolution_backward", + ]: + if name in unambiguous_name: + ret_pointer_can_be_null = True + break + + callsite_exprs: list[str] = [] + for idx, ret in enumerate(schema.returns): + tmp = "tmp_result" if len(names) == 1 else f"std::get<{idx}>(tmp_result)" + assert isinstance(ret.type, BaseType) + rval = convert_return(ret.type, tmp) + if ret_pointer_can_be_null: + callsite_exprs.append(f"if ({names[idx]}) {{ *{names[idx]} = {rval}; }}") + else: + callsite_exprs.append(f"*{names[idx]} = {rval};") + + return zip_type_and_name(types, names), callsite_exprs + + +# gen.py generates header first and then src, so caching the result here to avoid duplicate work +declaration_definition_cache: dict[tuple[str, str, str], tuple[str, str]] = {} + + +def gen_declaration_and_definition( + schema: FunctionSchema, device: str, backend_call: str +) -> tuple[str, str]: + func_name = schema.name.unambiguous_name() + + global declaration_definition_cache + if (func_name, device, backend_call) in declaration_definition_cache: + return declaration_definition_cache[(func_name, device, backend_call)] + + if schema.is_out_fn(): + # out_variant has out arguments in the front, and it's ok to ignore return values + # because C shim functions only return AOTITorchError + args, callsite_exprs = gen_arguments( + [*schema.arguments.out, *schema.arguments.flat_non_out] + ) + ret_assignments: list[str] = [] + else: + args, callsite_exprs = gen_arguments(schema.arguments.flat_all) + # ignore return values for inplace ops + ret_declarations, ret_assignments = ( + ([], []) if schema.name.name.inplace else gen_returns(schema) + ) + args.extend(ret_declarations) + + declaration = f"AOTITorchError aoti_torch_{device}_{func_name}({', '.join(args)})" + + tmp_result = "auto tmp_result = " if ret_assignments else "" + ret_assignments_str = "\n" + "\n".join(ret_assignments) if ret_assignments else "" + definition = f""" +{declaration} {{ + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({{ + {tmp_result}{backend_call}( +{textwrap.indent(', '.join(callsite_exprs), " ")} + );{textwrap.indent(ret_assignments_str, " ")} + }}); +}} +""" + declaration_definition_cache[(func_name, device, backend_call)] = ( + declaration, + definition, + ) + return declaration, definition + + +def gen_static_dispatch_backend_call_signature( + sig: CppSignature | DispatcherSignature, + f: NativeFunction, +) -> CppSignature: + sig = DispatcherSignature.from_schema(f.func) + cpp_sigs = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=False + ) + if sig.symint and 
f.func.has_symint(): + cpp_sig = cpp_sigs.symint_signature + else: + cpp_sig = cpp_sigs.signature + assert cpp_sig is not None + return cpp_sig + + +def gen_static_dispatch_backend_call( + f: NativeFunction, + backend_index: BackendIndex, +) -> str: + sig = DispatcherSignature.from_schema(f.func) + cpp_sig = gen_static_dispatch_backend_call_signature(sig, f) + return f"at::{backend_index.dispatch_key.lower()}::{cpp_sig.name()}" + + +def get_backend_index_for_aoti( + func: NativeFunction, + func_group_mapping: dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: dict[DispatchKey, BackendIndex], +) -> BackendIndex | None: + backend_index = None + if backend_indices[dispatch_key].has_kernel(func) or ( + func.structured_delegate is not None + and func.structured_delegate in func_group_mapping + and backend_indices[dispatch_key].has_kernel( + func_group_mapping[func.structured_delegate] + ) + ): + backend_index = backend_indices[dispatch_key] + elif backend_indices[DispatchKey.CompositeExplicitAutograd].has_kernel(func): + # We need to create C shim wrappers for CompositeExplicitAutograd kernels + backend_index = backend_indices[DispatchKey.CompositeExplicitAutograd] + elif backend_indices[DispatchKey.CompositeExplicitAutogradNonFunctional].has_kernel( + func + ): + # We need to create C shim wrappers for CompositeExplicitAutogradNonFunctional kernels + backend_index = backend_indices[ + DispatchKey.CompositeExplicitAutogradNonFunctional + ] + elif backend_indices[DispatchKey.CompositeImplicitAutograd].has_kernel(func): + backend_index = backend_indices[DispatchKey.CompositeImplicitAutograd] + + return backend_index + + +def get_header_for_aoti( + func: NativeFunction, + func_group_mapping: dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: dict[DispatchKey, BackendIndex], +) -> str | None: + backend_index = get_backend_index_for_aoti( + func, func_group_mapping, dispatch_key, backend_indices + ) + return ( + None + if backend_index is None + else f"#include " + ) + + +def get_fallback_op_name(func: NativeFunction) -> str: + return ( + f"{func.namespace}.{func.func.name.name}.{func.func.name.overload_name}" + if func.func.name.overload_name + else f"{func.namespace}.{func.func.name.name}.default" + ) + + +def gen_c_shim( + func: NativeFunction, + func_group_mapping: dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: dict[DispatchKey, BackendIndex], + header: bool, +) -> str | None: + backend_index = get_backend_index_for_aoti( + func, func_group_mapping, dispatch_key, backend_indices + ) + if backend_index is None: + return None + + schema = func.func + device = dispatch_key.lower() + backend_call = gen_static_dispatch_backend_call( + func, + backend_index, + ) + + try: + if header: + declaration, _ = gen_declaration_and_definition( + schema, device, backend_call + ) + return f"AOTI_TORCH_EXPORT {declaration};" + else: + _, definition = gen_declaration_and_definition(schema, device, backend_call) + return definition + + except NotImplementedError: + return None + + +@dataclass(frozen=True) +class ShimGenerator: + func_group_mapping: dict[OperatorName, NativeFunctionsGroup] + dispatch_key: DispatchKey + backend_indices: dict[DispatchKey, BackendIndex] + header: bool # True to generate .h and False to generate .cpp + + @method_with_native_function + def __call__( + self, + func: NativeFunction, + ) -> str | None: + result = gen_c_shim( + func, + self.func_group_mapping, + 
self.dispatch_key, + self.backend_indices, + self.header, + ) + return result + + +def gen_aoti_c_shim( + native_functions: Sequence[NativeFunction], + func_group_mapping: dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: dict[DispatchKey, BackendIndex], + header: bool, + includes: str = "", +) -> str: + body = "\n".join( + list( + mapMaybe( + ShimGenerator( + func_group_mapping, dispatch_key, backend_indices, header + ), + native_functions, + ) + ) + ) + device = dispatch_key.lower() + + warning = """ +// WARNING: THIS FILE IS AUTOGENERATED BY torchgen. DO NOT MODIFY BY HAND. +// See https://github.com/pytorch/pytorch/blob/7e86a7c0155295539996e0cf422883571126073e/torchgen/gen.py#L2424-L2436 for details""" + + if header: + return f""" +{warning} + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" {{ +#endif + +{body} + +#ifdef __cplusplus +}} // extern "C" +#endif +""" + + else: + return f""" +{warning} + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#include +#else +{includes} +#endif + +using namespace torch::aot_inductor; + +{body}""" diff --git a/torchgen/gen_backend_stubs.py b/torchgen/gen_backend_stubs.py new file mode 100644 index 00000000000..92a897a330f --- /dev/null +++ b/torchgen/gen_backend_stubs.py @@ -0,0 +1,611 @@ +from __future__ import annotations + +import argparse +import os +import re +from collections import Counter, defaultdict, namedtuple +from pathlib import Path +from typing import Sequence + +import yaml + +import torchgen.api.dispatcher as dispatcher +import torchgen.dest as dest +from torchgen.api.types import DispatcherSignature +from torchgen.code_template import CodeTemplate +from torchgen.context import native_function_manager +from torchgen.gen import get_grouped_native_functions, parse_native_yaml +from torchgen.model import ( + BackendIndex, + BackendMetadata, + DispatchKey, + NativeFunction, + NativeFunctionsGroup, + OperatorName, +) +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.utils import concatMap, context, FileManager, NamespaceHelper, Target +from torchgen.yaml_utils import YamlLoader + + +# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. 
+# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping) +ParsedExternalYaml = namedtuple( + "ParsedExternalYaml", + ["backend_key", "autograd_key", "class_name", "cpp_namespace", "backend_indices"], +) + + +def parse_backend_yaml( + backend_yaml_path: str, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + backend_indices: dict[DispatchKey, BackendIndex], +) -> ParsedExternalYaml: + native_functions_map: dict[OperatorName, NativeFunction] = { + f.func.name: f + for f in concatMap( + lambda f: [f] if isinstance(f, NativeFunction) else list(f.functions()), + grouped_native_functions, + ) + } + + with open(backend_yaml_path) as f: + yaml_values = yaml.load(f, Loader=YamlLoader) + assert isinstance(yaml_values, dict) + + valid_keys = [ + "backend", + "class_name", + "cpp_namespace", + "extra_headers", + "supported", + "autograd", + "full_codegen", + "non_native", + "ir_gen", + "symint", + ] + + backend = yaml_values.pop("backend", None) + assert backend is not None, 'You must provide a value for "backend"' + + class_name = yaml_values.pop("class_name", None) + + cpp_namespace = yaml_values.pop("cpp_namespace", None) + assert cpp_namespace is not None, 'You must provide a value for "cpp_namespace"' + + # Mostly just defaulting to false to stick with LazyTensor convention. + use_out_as_primary = yaml_values.pop("use_out_as_primary", False) + assert isinstance( + use_out_as_primary, bool + ), f"You must provide either True or False for use_out_as_primary. Provided: {use_out_as_primary}" + + use_device_guard = yaml_values.pop("device_guard", False) + assert isinstance( + use_device_guard, bool + ), f"You must provide either True or False for device_guard. Provided: {use_device_guard}" + + supported = yaml_values.pop("supported", []) + if supported is None: + supported = [] # Allow an empty list of supported ops + assert isinstance( + supported, list + ), f'expected "supported" to be a list, but got: {supported} (of type {type(supported)})' + + symint = yaml_values.pop("symint", []) + if symint is None: + symint = [] # Allow an empty list of symint ops + assert isinstance( + symint, list + ), f'expected "symint" to be a list, but got: {supported} (of type {type(supported)})' + symint_set = set(symint) + + supported_autograd = yaml_values.pop("autograd", []) + assert isinstance( + supported_autograd, list + ), f'expected "autograd" to be a list, but got: {supported_autograd}' + + # full_codegen is ignored by parse_backend_yaml, and re-parsed in gen_lazy_tensor.py + full_codegen = yaml_values.pop("full_codegen", []) + supported.extend(full_codegen) + + # non_native is ignored by parse_backend_yaml, and re-parsed in gen_lazy_tensor.py + yaml_values.pop("non_native", {}) + + # ir_gen is ignored by parse_backend_yaml, and re-parsed in gen_lazy_tensor.py + yaml_values.pop("ir_gen", {}) + + assert ( + len(yaml_values.keys()) == 0 + ), f'{backend_yaml_path} contains unexpected keys: {", ".join(yaml_values.keys())}. 
\ +Only the following keys are supported: {", ".join(valid_keys)}' + + def create_backend_index( + backend_ops: list[str], + symint_ops: set[str], + dispatch_key: DispatchKey, + *, + use_out_as_primary: bool, + use_device_guard: bool, + ) -> BackendIndex: + metadata: dict[OperatorName, BackendMetadata] = {} + for op in backend_ops: + op_name = OperatorName.parse(op) + assert ( + op_name in native_functions_map + ), f"Found an invalid operator name: {op_name}" + # See Note [External Backends Follow Dispatcher API] + kernel_name = dispatcher.name(native_functions_map[op_name].func) + if op in symint_ops: + kernel_name += "_symint" + # TODO: allow structured external backends later. + m = BackendMetadata( + kernel=kernel_name, structured=False, cpp_namespace=cpp_namespace + ) + metadata[op_name] = m + return BackendIndex( + dispatch_key=dispatch_key, + use_out_as_primary=use_out_as_primary, + external=True, + device_guard=use_device_guard, + index=metadata, + ) + + backend_key: DispatchKey | None = None + if len(supported) > 0: + with context( + lambda: f'The provided value for "backend" must be a valid DispatchKey, but got {backend}.' + ): + backend_key = DispatchKey.parse(backend) + + backend_idx = create_backend_index( + supported, + symint_set, + backend_key, + use_out_as_primary=use_out_as_primary, + use_device_guard=use_device_guard, + ) + assert backend_key not in backend_indices + backend_indices[backend_key] = backend_idx + + autograd_key: DispatchKey | None = None + if len(supported_autograd) > 0: + with context( + lambda: f'The "autograd" key was specified, which indicates that you would like to override \ +the behavior of autograd for some operators on your backend. However "Autograd{backend}" is not a valid DispatchKey.' + ): + autograd_key = DispatchKey.parse(f"Autograd{backend}") + + autograd_idx = create_backend_index( + supported_autograd, + symint_set, + autograd_key, + use_out_as_primary=use_out_as_primary, + use_device_guard=use_device_guard, + ) + assert autograd_key not in backend_indices + backend_indices[autograd_key] = autograd_idx + + for g in grouped_native_functions: + if isinstance(g, NativeFunction): + forward_kernels = ( + [] + if backend_key is None + else [ + m + for m in [backend_indices[backend_key].get_kernel(g)] + if m is not None + ] + ) + backward_kernels = ( + [] + if autograd_key is None + else [ + m + for m in [backend_indices[autograd_key].get_kernel(g)] + if m is not None + ] + ) + else: + forward_kernels = ( + [] + if backend_key is None + else [ + m + for m in [ + backend_indices[backend_key].get_kernel(f) + for f in g.functions() + ] + if m is not None + ] + ) + backward_kernels = ( + [] + if autograd_key is None + else [ + m + for m in [ + backend_indices[autograd_key].get_kernel(f) + for f in g.functions() + ] + if m is not None + ] + ) + + forward_kernels = [f for f in forward_kernels if f is not None] + backward_kernels = [f for f in backward_kernels if f is not None] + assert ( + len(forward_kernels) == 0 or len(backward_kernels) == 0 + ), f'Currently, all variants of an op must either be registered to a backend key, or to a backend\'s \ +autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! \ +{forward_kernels[0].kernel} is listed under "supported", but {backward_kernels[0].kernel} is listed under "autograd".' 
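
For reference, a minimal example of the kind of external-backend yaml this parser consumes (the backend, namespace, and operator names below are purely illustrative; only the keys correspond to valid_keys above):

# Hypothetical input for parse_backend_yaml. "supported" ops register under the
# backend's dispatch key, "autograd" ops under Autograd<Backend>, and "symint"
# marks ops whose kernel names get the "_symint" suffix.
import yaml

example_backend_yaml = """\
backend: XLA
cpp_namespace: torch_xla
supported:
  - abs
  - add.Tensor
autograd:
  - embedding_dense_backward
symint:
  - empty_strided
"""
parsed = yaml.safe_load(example_backend_yaml)
assert parsed["backend"] == "XLA" and "add.Tensor" in parsed["supported"]
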
+ + return ParsedExternalYaml( + backend_key, autograd_key, class_name, cpp_namespace, backend_indices + ) + + +def error_on_missing_kernels( + native_functions: Sequence[NativeFunction], + backend_indices: dict[DispatchKey, BackendIndex], + backend_key: DispatchKey, + autograd_key: DispatchKey | None, + class_name: str, + kernel_defn_file_path: str, + full_codegen: list[OperatorName] | None = None, +) -> None: + try: + with open(kernel_defn_file_path) as f: + backend_defns = f.read() + except OSError as e: + raise AssertionError( + f"Unable to read from the specified impl_path file: {kernel_defn_file_path}" + ) from e + + if full_codegen is None: + full_codegen = [] + + indices = [backend_indices[backend_key].index] + ( + [] if autograd_key is None else [backend_indices[autograd_key].index] + ) + # Quick mapping from each OperatorName used by the external backend + # to its backend kernel name + expected_backend_op_names: dict[OperatorName, str] = dict( + list( + concatMap( + lambda index: [ + (op_name, metadata.kernel) for op_name, metadata in index.items() + ], + indices, + ) + ) + ) + expected_backend_native_funcs: list[NativeFunction] = [ + f + for f in native_functions + if f.func.name in expected_backend_op_names.keys() + and f.func.name not in full_codegen + ] + expected_backend_kernel_name_counts: dict[str, list[NativeFunction]] = defaultdict( + list + ) + for native_f in expected_backend_native_funcs: + expected_backend_kernel_name_counts[ + expected_backend_op_names[native_f.func.name] + ].append(native_f) + + # This just looks for lines containing "foo(", and assumes that the kernel foo has been implemented. + # It might cause false negatives (we won't catch all cases), but that's ok - if we catch a missing kernel + # here, then we get a nicer error message. If we miss it, you get a linker error. + kernel_defn_regex = rf"(.*){class_name}::\s*([\w\d]*)\(" + actual_backend_kernel_name_counts = Counter( + # A bit unwieldy (this could probably be moved into regex), + # but we don't want to include kernel names that come from function calls, + # like "return torch_xla::XLANativeFunctions::empty_strided_symint(...)". + # Easy check is to ignore any lines with colons before the class name. + [ + y + for (x, y) in re.findall(kernel_defn_regex, backend_defns) + if not x.endswith(":") + ] + ) + + missing_kernels_err_msg = "" + for expected_name, funcs in expected_backend_kernel_name_counts.items(): + expected_overload_count = len(funcs) + actual_overload_count = actual_backend_kernel_name_counts[expected_name] + if expected_overload_count != actual_overload_count: + + def create_decl(f: NativeFunction) -> str: + with native_function_manager(f): + return DispatcherSignature.from_schema(f.func).decl() + + expected_schemas_str = "\n".join([create_decl(f) for f in funcs]) + missing_kernels_err_msg += f""" +{class_name} is missing a kernel definition for {expected_name}. We found {actual_overload_count} kernel(s) with that name, +but expected {expected_overload_count} kernel(s). 
The expected function schemas for the missing operator are: +{expected_schemas_str} + +""" + assert missing_kernels_err_msg == "", missing_kernels_err_msg + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate backend stub files") + parser.add_argument( + "-s", + "--source-yaml", + "--source_yaml", + help="path to source yaml file containing operator external definitions", + ) + parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory") + parser.add_argument( + "--dry-run", "--dry_run", type=bool, default=False, help="output directory" + ) + parser.add_argument( + "--impl-path", + "--impl_path", + type=str, + default=None, + help="path to the source C++ file containing kernel definitions", + ) + options = parser.parse_args() + + run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path) + + +def gen_dispatchkey_nativefunc_headers( + fm: FileManager, + class_name: str, + cpp_namespace: str, + backend_indices: dict[DispatchKey, BackendIndex], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + backend_dispatch_key: DispatchKey, + autograd_dispatch_key: DispatchKey | None, + backend_name: str = "", +) -> None: + assert class_name is not None + generated_comment = ( + "Autogenerated file by gen_backend_stubs.py. Do not edit directly!" + ) + + # Convert to a set first to remove duplicate kernel names. + # Backends are allowed to repeat kernel names; only generate the declaration once! + # Sort for deterministic output. + backend_declarations = sorted( + set( + concatMap( + lambda f: dest.compute_native_function_declaration( + f, backend_indices[backend_dispatch_key] + ), + grouped_native_functions, + ) + ) + ) + autograd_declarations = sorted( + set( + concatMap( + lambda f: [] + if autograd_dispatch_key is None + else dest.compute_native_function_declaration( + f, backend_indices[autograd_dispatch_key] + ), + grouped_native_functions, + ) + ) + ) + + ns_helper = NamespaceHelper(cpp_namespace) + fm.write_with_template( + f"{backend_dispatch_key}NativeFunctions.h", + "DispatchKeyNativeFunctions.h", + lambda: { + "generated_comment": generated_comment, + "namespace_prologue": ns_helper.prologue, + "class_name": class_name, + "namespace_epilogue": ns_helper.epilogue, + "dispatch_declarations": backend_declarations + autograd_declarations, + "BackendName": backend_name, + "DispatchKey": backend_dispatch_key, + }, + ) + + +def gen_dispatcher_registrations( + fm: FileManager, + output_dir: str, + class_name: str, + backend_indices: dict[DispatchKey, BackendIndex], + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], + backend_dispatch_key: DispatchKey, + dispatch_key: DispatchKey, + selector: SelectiveBuilder, + # build_in_tree is true for lazy TS backend and affects include paths, not used for external backends + build_in_tree: bool = False, + per_operator_headers: bool = False, + backend_name: str = "", + eager_registration: bool = True, +) -> None: + headers = [ + f"{output_dir}/{backend_dispatch_key}NativeFunctions.h", + ] + if build_in_tree: + external_backend_headers_str = "\n".join(f"#include <{h}>" for h in headers) + else: + external_backend_headers_str = "\n".join(f'#include "{h}"' for h in headers) + + assert class_name is not None + backend_index = backend_indices[dispatch_key] + + dispatch_registrations_body = list( + concatMap( + dest.RegisterDispatchKey( + backend_index, + Target.REGISTRATION, + selector, + rocm=False, + symint=True, + 
class_method_name=f"{class_name}", + skip_dispatcher_op_registration=False, + ), + grouped_native_functions, + ) + ) + newline = "\n" + ns_helper = NamespaceHelper(namespace_str="at") + deferred_dispatch_registrations = "" + static_init_dispatch_registrations = "" + if eager_registration: + static_template = CodeTemplate( + """\ +TORCH_LIBRARY_IMPL(aten, $dispatch_key, m) { + $dispatch_registrations_body +};""" + ) + static_init_dispatch_registrations = static_template.substitute( + dispatch_key=dispatch_key, + dispatch_registrations_body=dispatch_registrations_body, + ) + else: + deferred_template = CodeTemplate( + """\ +TORCH_API void Register${backend_name}${dispatch_key}NativeFunctions(); +TORCH_API void Register${backend_name}${dispatch_key}NativeFunctions() { + static auto m = MAKE_TORCH_LIBRARY_IMPL(aten, $dispatch_key); + $dispatch_registrations_body +}""" + ) + deferred_dispatch_registrations = deferred_template.substitute( + backend_name=backend_name, + dispatch_key=dispatch_key, + dispatch_registrations_body=dispatch_registrations_body, + ) + + fm.write_with_template( + f"Register{dispatch_key}.cpp", + "RegisterDispatchKey.cpp", + lambda: { + "extra_cuda_headers": "", + "external_backend_headers": external_backend_headers_str, + "ops_headers": "#include " + if not per_operator_headers + else "", + "DispatchKey": dispatch_key, + "dispatch_namespace": dispatch_key.lower(), + "dispatch_headers": dest.gen_registration_headers( + backend_index, per_operator_headers=per_operator_headers, rocm=False + ), + "dispatch_definitions": fm.substitute_with_template( + "RegisterDispatchDefinitions.ini", + lambda: { + "ns_prologue": ns_helper.prologue, + "ns_epilogue": ns_helper.epilogue, + "static_init_dispatch_registrations": static_init_dispatch_registrations, + "deferred_dispatch_registrations": deferred_dispatch_registrations, + "dispatch_helpers": dest.gen_registration_helpers(backend_index), + "dispatch_namespace": dispatch_key.lower(), + "dispatch_namespaced_definitions": "", + "dispatch_anonymous_definitions": list( + concatMap( + dest.RegisterDispatchKey( + backend_index, + Target.ANONYMOUS_DEFINITION, + selector, + rocm=False, + symint=True, + class_method_name=f"{class_name}", + skip_dispatcher_op_registration=False, + ), + grouped_native_functions, + ) + ), + }, + ).split(newline), + }, + ) + + +def run( + source_yaml: str, output_dir: str, dry_run: bool, impl_path: str | None = None +) -> None: + # Assumes that this file lives at PYTORCH_ROOT/torchgen/gen_backend_stubs.py + pytorch_root = Path(__file__).parent.parent.absolute() + template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates") + + def make_file_manager(install_dir: str) -> FileManager: + return FileManager( + install_dir=install_dir, template_dir=template_dir, dry_run=dry_run + ) + + fm = make_file_manager(output_dir) + + native_yaml_path = os.path.join( + pytorch_root, "aten/src/ATen/native/native_functions.yaml" + ) + tags_yaml_path = os.path.join(pytorch_root, "aten/src/ATen/native/tags.yaml") + parsed_yaml = parse_native_yaml(native_yaml_path, tags_yaml_path) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + grouped_native_functions = get_grouped_native_functions(native_functions) + parsed_backend_yaml = parse_backend_yaml( + source_yaml, grouped_native_functions, backend_indices + ) + backend_key = parsed_backend_yaml.backend_key + autograd_key = parsed_backend_yaml.autograd_key + cpp_namespace = parsed_backend_yaml.cpp_namespace + class_name = 
parsed_backend_yaml.class_name + backend_indices = parsed_backend_yaml.backend_indices + + selector = SelectiveBuilder.get_nop_selector() + + if backend_key is None: + # This could be useful if a backend wants to quickly set up a noop yaml file but doesn't have any kernels ready yet. + return + + if class_name is None: + # class_name is an optional argument to backend yaml file. + # if specified it allows an external backend to override + # the name of the class that all generated kernel definitions live under. + # if not specified, its value is given as native_function_class_name. + class_name = backend_indices[backend_key].native_function_class_name() + assert class_name is not None + + if impl_path is not None: + error_on_missing_kernels( + native_functions, + backend_indices, + backend_key, + autograd_key, + class_name, + impl_path, + ) + + gen_dispatchkey_nativefunc_headers( + fm, + class_name, + cpp_namespace, + backend_indices, + grouped_native_functions, + backend_key, + autograd_key, + ) + + for dispatch_key in ( + [backend_key] if autograd_key is None else [backend_key, autograd_key] + ): + gen_dispatcher_registrations( + fm, + output_dir, + class_name, + backend_indices, + grouped_native_functions, + backend_key, + dispatch_key, + selector, + ) + + +if __name__ == "__main__": + main() diff --git a/torchgen/gen_executorch.py b/torchgen/gen_executorch.py new file mode 100644 index 00000000000..d29713568e6 --- /dev/null +++ b/torchgen/gen_executorch.py @@ -0,0 +1,998 @@ +from __future__ import annotations + +import argparse +import os +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Sequence, TextIO, TYPE_CHECKING + +import yaml + +# Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices. +from torchgen import dest +from torchgen.api import cpp as aten_cpp +from torchgen.api.types import CppSignature, CppSignatureGroup, CType, NamedCType +from torchgen.context import ( + method_with_native_function, + method_with_nested_native_function, + with_native_function_and_index, +) +from torchgen.executorch.api import et_cpp +from torchgen.executorch.api.custom_ops import ( + ComputeNativeFunctionStub, + gen_custom_ops_registration, +) +from torchgen.executorch.api.types import contextArg, ExecutorchCppSignature +from torchgen.executorch.api.unboxing import Unboxing +from torchgen.executorch.model import ETKernelIndex, ETKernelKey, ETParsedYaml +from torchgen.executorch.parse import ET_FIELDS, parse_et_yaml, parse_et_yaml_struct +from torchgen.gen import ( + get_custom_build_selector, + get_native_function_declarations, + get_native_function_declarations_from_ns_grouped_kernels, + get_native_function_schema_registrations, + LineLoader, + parse_native_yaml, +) +from torchgen.model import ( + BackendIndex, + BackendMetadata, + DEFAULT_KERNEL_NAMESPACE, + DispatchKey, + FunctionSchema, + Location, + NativeFunction, + NativeFunctionsGroup, + OperatorName, + Variant, +) +from torchgen.utils import ( + context, + FileManager, + make_file_manager, + mapMaybe, + NamespaceHelper, +) + + +if TYPE_CHECKING: + from torchgen.selective_build.selector import SelectiveBuilder + + +def _sig_decl_wrapper(sig: CppSignature | ExecutorchCppSignature) -> str: + """ + A wrapper function to basically get `sig.decl(include_context=True)`. + For ATen kernel, the codegen has no idea about ET contextArg, so we + use this wrapper to add it. 
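+
+    For illustration, an ATen decl such as `at::Tensor op(const at::Tensor & self)`
+    roughly becomes `at::Tensor op(<contextArg.decl()>, const at::Tensor & self)`;
+    the exact first parameter is whatever `contextArg` declares.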
+ """ + if isinstance(sig, ExecutorchCppSignature): + return sig.decl() + + returns_type = aten_cpp.returns_type(sig.func.returns).cpp_type() + cpp_args = [a.decl() for a in sig.arguments()] + cpp_args_str = ", ".join([contextArg.decl()] + cpp_args) + sig_decl = f"{returns_type} {sig.name()}({cpp_args_str})" + return sig_decl + + +def static_dispatch( + sig: CppSignature | ExecutorchCppSignature, + f: NativeFunction, + backend_indices: list[BackendIndex], +) -> str: + """ + For a given `NativeFunction`, find out the corresponding native function and dispatch to it. If zero or more than one + native function exists, error out. A simplified version of register_dispatch_key.py + Arguments: + sig: A CppSignature for this native function we want to use. + f: NativeFunction to generate static dispatch. + backend_indices: All available backends. + Return: + C++ code to call backend-specific functions, e.g., "return at::native::add(self, other, scale);" + """ + if len(backend_indices) == 0 or f.manual_kernel_registration: + return "" + + backends = [b for b in backend_indices if b.has_kernel(f)] + static_block = None + if len(backends) == 1: + backend_metadata = backends[0].get_kernel(f) + if backend_metadata: + args = ", ".join(a.name for a in sig.arguments()) + # Here we are assuming there's no difference between CppSignature and NativeSignature for Executorch. + static_block = f"return ::{backend_metadata.cpp_namespace}::{backend_metadata.kernel}({args});" + else: + static_block = f""" +ET_ASSERT_UNREACHABLE_MSG("The number of native function(s) binding to {f.func.name} is {len(backends)}."); + """ + return f""" +// {f.namespace}::{f.func} +TORCH_API inline {_sig_decl_wrapper(sig)} {{ + {static_block} +}} +""" + + +# Generates Functions.h, which provides the functional public C++ API, +# and the scaffolding to call into the dispatcher from these functions. +@dataclass(frozen=True) +class ComputeFunction: + static_dispatch_backend_indices: list[BackendIndex] + + selector: SelectiveBuilder + + use_aten_lib: bool + + is_custom_op: Callable[[NativeFunction], bool] + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + is_method_variant = False + if not self.selector.is_root_operator(f"{f.namespace}::{f.func.name}"): + return None + + if Variant.function not in f.variants and Variant.method in f.variants: + is_method_variant = True + + # only valid remaining case is only function is in f.variants + elif not (Variant.function in f.variants and Variant.method not in f.variants): + raise Exception( # noqa: TRY002 + f"Can't handle native function {f.func} with the following variant specification {f.variants}." 
+ ) + + sig: CppSignature | ExecutorchCppSignature = ( + CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=f.manual_cpp_binding + ).most_faithful_signature() + if self.use_aten_lib + else ExecutorchCppSignature.from_native_function(f) + ) + if self.use_aten_lib and not self.is_custom_op(f): + comma = ", " + + if is_method_variant: + return f""" +// {f.namespace}::{f.func} +TORCH_API inline {_sig_decl_wrapper(sig)} {{ + return {sig.arguments()[0].name}.{sig.name()}({comma.join(e.name for e in sig.arguments()[1:])}); +}} +""" + else: + return f""" +// {f.namespace}::{f.func} +TORCH_API inline {_sig_decl_wrapper(sig)} {{ + return at::{sig.name()}({comma.join(e.name for e in sig.arguments())}); +}} +""" + + else: + return static_dispatch( + sig, + f, + backend_indices=self.static_dispatch_backend_indices, + ) + + +# Generates RegisterCodegenUnboxedKernels.cpp. +@dataclass(frozen=True) +class ComputeCodegenUnboxedKernels: + selector: SelectiveBuilder + + use_aten_lib: bool + + @method_with_nested_native_function + def __call__( + self, + unbox_kernel_entry: tuple[NativeFunction, tuple[ETKernelKey, BackendMetadata]], + ) -> str: + f: NativeFunction = unbox_kernel_entry[0] + kernel_key: ETKernelKey | list[ETKernelKey] = unbox_kernel_entry[1][0] + kernel_meta: BackendMetadata = unbox_kernel_entry[1][1] + + op_name = f"{f.namespace}::{f.func.name}" + if not self.selector.is_root_operator(op_name): + return "" + + if not isinstance(kernel_key, list): + kernel_key = [kernel_key] + used_kernel_keys = self.selector.et_get_selected_kernels( + op_name, [k.to_native_string() for k in kernel_key] + ) + if not used_kernel_keys: + return "" + sig: CppSignature | ExecutorchCppSignature + argument_type_gen: Callable[..., NamedCType] + return_type_gen: Callable[..., CType] + if self.use_aten_lib: + sig = CppSignatureGroup.from_native_function( + f, method=False, fallback_binding=f.manual_cpp_binding + ).most_faithful_signature() + argument_type_gen = aten_cpp.argumenttype_type + return_type_gen = aten_cpp.returns_type + arguments = sig.arguments() + kernel_call = f"torch::executor::{f.namespace}::{sig.name()}" + else: + sig = ExecutorchCppSignature.from_native_function(f) + argument_type_gen = et_cpp.argumenttype_type + return_type_gen = et_cpp.returns_type + arguments = sig.arguments(include_context=False) + kernel_call = f"{kernel_meta.cpp_namespace}::{kernel_meta.kernel}" + # parse arguments into C++ code + binding_list, code_list = Unboxing( + argument_type_gen=argument_type_gen + ).convert_arguments(arguments) + + # for each C++ argument, generate the conversion code + code_connector = "\n\t" + arg_connector = ", " + + args_str = f"{arg_connector.join(e.name for e in binding_list)}" + event_tracer_output_logging = "" + output_ids = [] + + if len(f.func.returns) == 0: + if len(f.func.arguments.out) == 0: + raise Exception( # noqa: TRY002 + f"Can't handle native function {f.func} with no returns and no out yet." 
+ ) + out = f.func.arguments.out[0] + return_assignment = f"""stack[{len(binding_list)}] = &{out.name};""" + ret_prefix = "" + output_ids = [len(binding_list)] + else: + if len(f.func.arguments.out) == 0: + return_assignment = ( + f"""*stack[{len(binding_list)}] = EValue(result_);""" + ) + ret_prefix = return_type_gen(f.func.returns).cpp_type() + " result_ = " + output_ids = [len(binding_list)] + else: + return_assignment = "" + ret_prefix = "" + output_ids = [ + len(binding_list) - (i + 1) + for i in reversed(range(len(f.func.arguments.out))) + ] + + for output_id in output_ids: + event_tracer_output_logging += ( + f"internal::event_tracer_log_evalue(" + f"context.internal_event_tracer(), " + f"*stack[{output_id}]);\n" + ) + + newline = "\n " + return "\n".join( + [ + f""" +Kernel( + "{f.namespace}::{f.func.name}",{newline + '"' + (k + '",') if k != 'default' else ''} + []({contextArg.defn()}, EValue** stack) {{ + {code_connector.join(code_list)} + + internal::EventTracerProfileOpScope event_tracer_op_scope(context.internal_event_tracer(), "native_call_{f.func.name}"); + EXECUTORCH_SCOPE_PROF("native_call_{f.func.name}"); + {ret_prefix}{kernel_call}(context, {args_str}); + {event_tracer_output_logging} + {return_assignment} + }} +), +""" + for k in used_kernel_keys + ] + ) + + +def gen_unboxing( + *, + native_functions: Sequence[NativeFunction], + cpu_fm: FileManager, + selector: SelectiveBuilder, + use_aten_lib: bool, + kernel_index: ETKernelIndex, + manual_registration: bool, +) -> None: + # Iterable type for write_sharded is a Tuple of (native_function, (kernel_key, metadata)) + def key_func( + item: tuple[NativeFunction, tuple[ETKernelKey, BackendMetadata]] + ) -> str: + return item[0].root_name + ":" + item[1][0].to_native_string() + + items: list[tuple[NativeFunction, tuple[ETKernelKey, BackendMetadata]]] = [ + (native_function, (kernel_key, metadata)) + for native_function in native_functions + for kernel_key, metadata in kernel_index.get_kernels(native_function).items() + ] + + header = ["Functions.h" if use_aten_lib else "NativeFunctions.h"] + filename = ( + "RegisterKernels.cpp" + if manual_registration + else "RegisterCodegenUnboxedKernels.cpp" + ) + cpu_fm.write_sharded( + filename, + items, + key_fn=key_func, + env_callable=lambda unbox_kernel_entry: { + "unboxed_kernels": [ + ComputeCodegenUnboxedKernels(selector, use_aten_lib)(unbox_kernel_entry) + ], + "fn_header": header + if unbox_kernel_entry == items[0] + else [], # Only write header once + }, + num_shards=1, + sharded_keys={"unboxed_kernels", "fn_header"}, + ) + + +@with_native_function_and_index # type: ignore[arg-type] +def compute_native_function_declaration( + g: NativeFunctionsGroup | NativeFunction, kernel_index: ETKernelIndex +) -> list[str]: + assert isinstance(g, NativeFunction) + sig = ExecutorchCppSignature.from_native_function(f=g) + metadata_list = kernel_index.get_kernels(g).values() + if metadata_list is None: + return [] + + # for kernels in lean mode, we declare two versions, one with context and one without. + # In the end we will cleanup the unused one. 
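+    # e.g. for a kernel named `op_out`, this emits roughly:
+    #   Tensor& op_out(...);                       // without context
+    #   Tensor& op_out(<contextArg.decl()>, ...);  // with context
+    # (exact signatures come from ExecutorchCppSignature.decl()).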
+ def gen_decl(metadata: BackendMetadata, include_context: bool) -> str: + return f"{sig.decl(name=metadata.kernel, include_context=include_context)};" + + return [ + gen_decl(metadata, include_context) + for include_context in [False, True] + for metadata in metadata_list + ] + + +def gen_functions_declarations( + *, + native_functions: Sequence[NativeFunction], + kernel_index: ETKernelIndex, + selector: SelectiveBuilder, + use_aten_lib: bool, + custom_ops_native_functions: Sequence[NativeFunction] | None = None, +) -> str: + """ + Generates namespace separated C++ function API inline declaration/definitions. + Native functions are grouped by namespaces and the generated code is wrapped inside + namespace blocks. + + E.g., for `custom_1::foo.out` in yaml file we will generate a C++ API as a symbol + in `torch::executor::custom_1::foo_out`. This way we avoid symbol conflict when + the other `custom_2::foo.out` is available. + """ + + # convert kernel index to BackendIndex. This is because we can't handle ETKernelIndex yet. + # TODO larryliu: evaluate if this code is still needed. If yes let it handle ETKernelIndex. + + backend_index = kernel_index._to_backend_index() + + ns_grouped_functions = defaultdict(list) + for native_function in native_functions: + ns_grouped_functions[native_function.namespace].append(native_function) + functions_declarations = "" + newline = "\n" + for namespace in ns_grouped_functions: + ns_helper = NamespaceHelper( + namespace_str=namespace, + entity_name="", + max_level=3, + ) + declarations = list( + mapMaybe( + ComputeFunction( + static_dispatch_backend_indices=[backend_index], + selector=selector, + use_aten_lib=use_aten_lib, + is_custom_op=lambda f: custom_ops_native_functions is not None + and f in custom_ops_native_functions, + ), + ns_grouped_functions[namespace], + ) + ) + functions_declarations += f""" +{ns_helper.prologue} +{newline.join(declarations)} +{ns_helper.epilogue} + """ + return functions_declarations + + +def get_ns_grouped_kernels( + *, + native_functions: Sequence[NativeFunction], + kernel_index: ETKernelIndex, + native_function_decl_gen: Callable[ + [ + NativeFunctionsGroup | NativeFunction, + ETKernelIndex, + ], + list[str], + ], +) -> dict[str, list[str]]: + ns_grouped_kernels: dict[str, list[str]] = defaultdict(list) + for f in native_functions: + native_function_namespaces = set() + op_kernels = kernel_index.get_kernels(f) + for backend_metadata in op_kernels.values(): + if backend_metadata: + namespace = backend_metadata.cpp_namespace + native_function_namespaces.add(namespace) + else: + namespace = DEFAULT_KERNEL_NAMESPACE + assert ( + len(native_function_namespaces) <= 1 + ), f"Codegen only supports one namespace per operator, got {native_function_namespaces}" + ns_grouped_kernels[namespace].extend( + native_function_decl_gen(f, kernel_index) + ) + return ns_grouped_kernels + + +def gen_headers( + *, + native_functions: Sequence[NativeFunction], + gen_custom_ops_header: bool, + custom_ops_native_functions: Sequence[NativeFunction], + selector: SelectiveBuilder, + kernel_index: ETKernelIndex, + cpu_fm: FileManager, + use_aten_lib: bool, +) -> None: + """Generate headers. + + Args: + native_functions (Sequence[NativeFunction]): a collection of NativeFunction for ATen ops. + gen_custom_ops_header (bool): whether we should generate CustomOpsNativeFunctions.h + custom_ops_native_functions (Sequence[NativeFunction]): a collection of NativeFunction for custom ops. 
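+        selector (SelectiveBuilder): selector for selective build; filters which operators get generated.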
+ kernel_index (ETKernelIndex): kernel collection + cpu_fm (FileManager): file manager manages output stream + use_aten_lib (bool): whether we are generating for PyTorch types or Executorch types. + """ + aten_headers = ["#include "] + backend_indices = {DispatchKey.CPU: kernel_index._to_backend_index()} + if gen_custom_ops_header: + cpu_fm.write_with_template( + "CustomOpsNativeFunctions.h", + "NativeFunctions.h", + lambda: { + "nativeFunctions_declarations": get_native_function_declarations( + grouped_native_functions=custom_ops_native_functions, + backend_indices=backend_indices, + native_function_decl_gen=dest.compute_native_function_declaration, + ), + "headers": [ + "#include ", + "#include ", + ], + }, + ) + aten_headers.append('#include "CustomOpsNativeFunctions.h"') + cpu_fm.write( + "Functions.h", + lambda: { + "static_dispatch_extra_headers": aten_headers + if use_aten_lib + else ['#include "NativeFunctions.h"'], + "Functions_declarations": gen_functions_declarations( + native_functions=native_functions, + kernel_index=kernel_index, + selector=selector, + use_aten_lib=use_aten_lib, + custom_ops_native_functions=custom_ops_native_functions, + ), + }, + ) + cpu_fm.write( + "RegisterKernels.h", + lambda: { + "generated_comment": "@" + "generated by torchgen/gen_executorch.py", + }, + ) + headers = { + "headers": [ + "#include // at::Tensor etc.", + "#include ", + ], + } + if use_aten_lib: + headers["headers"].append("#include // TORCH_API") + cpu_fm.write( + "NativeFunctions.h", + lambda: dict( + { + "nativeFunctions_declarations": get_native_function_declarations( + grouped_native_functions=native_functions, + backend_indices=backend_indices, + native_function_decl_gen=dest.compute_native_function_declaration, + ), + }, + **headers, + ), + ) + else: + ns_grouped_kernels = get_ns_grouped_kernels( + native_functions=native_functions, + kernel_index=kernel_index, + native_function_decl_gen=compute_native_function_declaration, # type: ignore[arg-type] + ) + cpu_fm.write( + "NativeFunctions.h", + lambda: dict( + { + "nativeFunctions_declarations": get_native_function_declarations_from_ns_grouped_kernels( + ns_grouped_kernels=ns_grouped_kernels, + ), + }, + **headers, + ), + ) + + +def gen_custom_ops( + *, + native_functions: Sequence[NativeFunction], + selector: SelectiveBuilder, + kernel_index: ETKernelIndex, + cpu_fm: FileManager, + rocm: bool, +) -> None: + dispatch_key = DispatchKey.CPU + ( + anonymous_definition, + static_init_dispatch_registrations, + ) = gen_custom_ops_registration( + native_functions=native_functions, + selector=selector, + kernel_index=kernel_index, + rocm=rocm, + ) + cpu_fm.write_with_template( + f"Register{dispatch_key}CustomOps.cpp", + "RegisterDispatchKeyCustomOps.cpp", + lambda: { + "ops_headers": '#include "CustomOpsNativeFunctions.h"', + "DispatchKey": dispatch_key, + "dispatch_namespace": dispatch_key.lower(), + "dispatch_namespaced_definitions": "", + "dispatch_anonymous_definitions": anonymous_definition, + "static_init_dispatch_registrations": static_init_dispatch_registrations, + }, + ) + cpu_fm.write_with_template( + f"Register{dispatch_key}Stub.cpp", + "RegisterDispatchKeyCustomOps.cpp", + lambda: { + "ops_headers": "", + "DispatchKey": dispatch_key, + "dispatch_namespace": dispatch_key.lower(), + "dispatch_namespaced_definitions": "", + "dispatch_anonymous_definitions": list( + mapMaybe(ComputeNativeFunctionStub(), native_functions) + ), + "static_init_dispatch_registrations": static_init_dispatch_registrations, + }, + ) + + ( + 
aten_schema_registrations, + schema_registrations, + ) = get_native_function_schema_registrations( + native_functions=native_functions, + schema_selector=selector, + ) + cpu_fm.write( + "RegisterSchema.cpp", + lambda: { + "schema_registrations": schema_registrations, + "aten_schema_registrations": aten_schema_registrations, + }, + ) + + +def translate_native_yaml( + tags_yaml_path: str, + aten_yaml_path: str, + native_yaml_path: str | None, + use_aten_lib: bool, + out_file: TextIO, +) -> None: + """Translates Executorch DSL dialect to use the same syntax as + native_functions.yaml. The major difference is that Executorch DSL dialect + supports "op" key, where it refers to the operator name in native_functions.yaml. + + For example, a functions.yaml may have the following entry: + + - op: add.out + ... + + It needs to be translated to the following: + + - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + ... + + We go in aten_yaml_path and find the operator schema for "add.out" and add it + to the original functions.yaml. We also add required field "variants", where for + Executorch it will always be "function". + + For ATen mode we don't have to do the translation because native_yaml_path is + the same as native_functions.yaml. + + Args: + tags_yaml_path: Path to a tags.yaml file to satisfy codegen parsing. + It is not optional. + aten_yaml_path: Path to ATen operator yaml file native_functions.yaml. + native_yaml_path: Path to a functions.yaml file to parse. + If the path does not exist in the filesystem, it is treated as an + empty file. If `custom_ops_yaml_path` exists, the contents of that + file are appended to the yaml input to be parsed. + use_aten_lib: We use this flag to determine if we want to generate native + functions. In ATen mode we should generate out= variants. + out_file: The IO object that we are writing into. 
+ Returns: + None + """ + if use_aten_lib: + with open(aten_yaml_path) as aten_yaml: + out_file.writelines(aten_yaml.readlines()) + return + + native_functions, persisted_fields = parse_et_yaml( + aten_yaml_path, + tags_yaml_path, + None, + skip_native_fns_gen=False, + ) + + func_to_scoped_name: dict[FunctionSchema, str] = { + f.func: f"{f.namespace}::{f.func.name}" for f in native_functions + } + op_to_scoped_name: dict[OperatorName, str] = { + func.name: name for func, name in func_to_scoped_name.items() + } + + schema_dict = {name: str(func) for func, name in func_to_scoped_name.items()} + kernel_persist_dict: dict[str, dict[str, Any]] = { + op_to_scoped_name[op]: v for op, v in persisted_fields.items() + } + + if ( + not native_yaml_path + or not os.path.exists(native_yaml_path) + or os.stat(native_yaml_path).st_size == 0 + ): + return + with open(native_yaml_path) as native_yaml: + native_es = yaml.load(native_yaml, Loader=LineLoader) + if not native_es: + return + for e in native_es: + assert isinstance(e.get("__line__"), int), e + loc = Location(native_yaml_path, e.pop("__line__")) + with context(lambda: f"in {loc}:\n "): + if "variants" not in e: + e["variants"] = "function" + if "func" in e: + continue + assert isinstance(e.get("op"), str), e + opname = e.pop("op") + if "::" not in opname: + opname = "aten::" + opname + assert opname in schema_dict + e["func"] = schema_dict.get(opname) + + # Write out persisted kernel information + if opname in kernel_persist_dict: + for k, v in kernel_persist_dict[opname].items(): + e[k] = v + + yaml.dump(native_es, out_file, width=1000) + + +def parse_yaml( + path: str | None, + tags_yaml_path: str, + function_filter: Callable[[NativeFunction], bool], + skip_native_fns_gen: bool = False, +) -> tuple[ + list[NativeFunction], + dict[DispatchKey, dict[OperatorName, BackendMetadata]] | ETKernelIndex, +]: + if path and os.path.exists(path) and os.stat(path).st_size > 0: + with open(path) as f: + es = yaml.load(f, Loader=LineLoader) + + # Check for kernel index structure + kernel_index = ( + parse_et_yaml_struct(es) if any("kernels" in e for e in es) else None + ) + + # Remove ET specific fields from entries for BC compatibility + for entry in es: + for field in ET_FIELDS: + entry.pop(field, None) + + parsed_yaml = parse_native_yaml( + path, + tags_yaml_path, + None, + skip_native_fns_gen=skip_native_fns_gen, + loaded_yaml=es, + ) + native_functions = list(filter(function_filter, parsed_yaml.native_functions)) + op_names = [f.func.name for f in native_functions] + + # (1) Return ETKernelIndex if kernel index is present + if kernel_index is not None: + filtered_index = { + op_name: kernel_mapping + for op_name, kernel_mapping in kernel_index.index.items() + if op_name in op_names + } + return native_functions, ETKernelIndex(index=filtered_index) + + # (2) Return BackendIndices if kernel index is absent + def map_index( + m: dict[OperatorName, BackendMetadata] + ) -> dict[OperatorName, BackendMetadata]: + return {op: m[op] for op in m if op in op_names} + + backend_indices = { + k: map_index(b.index) for (k, b) in parsed_yaml.backend_indices.items() + } + + return native_functions, backend_indices + else: + return [], {} + + +def parse_yaml_files( + tags_yaml_path: str, + aten_yaml_path: str, + native_yaml_path: str | None, + custom_ops_yaml_path: str | None, + selector: SelectiveBuilder, + use_aten_lib: bool, +) -> tuple[ETParsedYaml, ETParsedYaml | None]: + """Parses functions.yaml and custom_ops.yaml files. 
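+    The functions.yaml input is first run through translate_native_yaml();
+    custom_ops.yaml (if given) is parsed as-is.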
+ + Args: + tags_yaml_path: Path to a tags.yaml file to satisfy codegen parsing. + It is not optional. + aten_yaml_path: Path to ATen operator yaml file native_functions.yaml. + native_yaml_path: Path to a functions.yaml file to parse. + If the path does not exist in the filesystem, it is treated as an + empty file. If `custom_ops_yaml_path` exists, the contents of that + file are appended to the yaml input to be parsed. + custom_ops_yaml_path: Path to a custom_ops.yaml file to parse. If + the path does not exist in the filesystem, it is ignored. + selector: For selective build. + use_aten_lib: We use this flag to determine if we want to generate native + functions. In ATen mode we should generate out= variants. + Returns: + A tuple with two elements: + [0]: The parsed results of concatenating the contents of + `native_yaml_path` and `custom_ops_yaml_path`. + [1]: The parsed results of the contents of `custom_ops_yaml_path`, if + present. If not present, None. + """ + import tempfile + + # only include selected ops, this is because we want to avoid + def function_filter(f: NativeFunction) -> bool: + return selector.is_native_function_selected(f) + + with tempfile.TemporaryDirectory() as tmpdirname: + translated_yaml_path = os.path.join(tmpdirname, "translated.yaml") + with open(translated_yaml_path, "w") as translated: + translate_native_yaml( + tags_yaml_path, + aten_yaml_path, + native_yaml_path, + use_aten_lib, + translated, + ) + + translated_functions, translated_indices = parse_yaml( + translated_yaml_path, tags_yaml_path, function_filter, not use_aten_lib + ) + custom_ops_functions, custom_ops_indices = parse_yaml( + custom_ops_yaml_path, tags_yaml_path, function_filter, True + ) + + # Convert BackendIndices to ETKernelIndex + if not isinstance(translated_indices, ETKernelIndex): + translated_indices = ETKernelIndex.from_backend_indices(translated_indices) + if not isinstance(custom_ops_indices, ETKernelIndex): + custom_ops_indices = ETKernelIndex.from_backend_indices(custom_ops_indices) + + combined_functions = translated_functions + custom_ops_functions + combined_kernel_index = ETKernelIndex.merge_indices( + translated_indices, custom_ops_indices + ) + combined_yaml = ETParsedYaml(combined_functions, combined_kernel_index) + custom_ops_parsed_yaml = ETParsedYaml(custom_ops_functions, custom_ops_indices) + + return combined_yaml, custom_ops_parsed_yaml + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate operator source files") + # Although we don't refer to --source-path directly, make_file_manager() + # expects it to point to a directory that contains a templates/ subdirectory + # containing the file templates. + parser.add_argument( + "-s", + "--source-path", + help="path to source directory for kernel templates", + ) + parser.add_argument( + "--functions-yaml-path", + "--functions_yaml_path", + help="path to the functions.yaml file to use. Optional, but at least " + "one of --functions-yaml-path and --custom-ops-yaml-path must be " + "specified.", + ) + parser.add_argument( + "--custom-ops-yaml-path", + "--custom_ops_yaml_path", + help="path to the custom_ops.yaml file to use. Optional, but at least " + "one of --functions-yaml-path and --custom-ops-yaml-path must be " + "specified.", + ) + parser.add_argument( + "--aten-yaml-path", + "--aten_yaml_path", + help="path to native_functions.yaml file.", + ) + # Note that make_file_manager() also looks at --install-dir. 
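+    # For illustration only (paths are hypothetical), a typical invocation looks like:
+    #   python torchgen/gen_executorch.py \
+    #     --source-path=<dir containing templates/> \
+    #     --aten-yaml-path=<...>/native_functions.yaml --tags-path=<...>/tags.yaml \
+    #     --functions-yaml-path=<...>/functions.yaml --install-dir=<output dir>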
+ parser.add_argument( + "-d", + "--install-dir", + "--install_dir", + help="output directory", + default="build/generated", + ) + parser.add_argument( + "-o", + "--output-dependencies", + help="output a list of dependencies into the given file and exit", + ) + # Although we don't refer to --dry-run directly, make_file_manager() looks + # for it. + parser.add_argument( + "--dry-run", + action="store_true", + help="run without writing any files (still updates outputs)", + ) + parser.add_argument( + "--static-dispatch-backend", + "--static_dispatch_backend", + nargs="*", + help="generate static dispatch code for the specific backend (if set)", + ) + parser.add_argument( + "--op-registration-whitelist", + "--op_registration_whitelist", + nargs="*", + help="filter op registrations by the whitelist (if set); " + "each item is `namespace`::`operator name` without overload name; " + "e.g.: aten::empty aten::conv2d ...", + ) + parser.add_argument( + "--op-selection-yaml-path", + "--op_selection_yaml_path", + help="Provide a path to the operator selection (for custom build) YAML " + "that contains the information about the set of selected operators " + "and their categories (training, ...). Each operator is either a " + "full operator name with overload or just a bare operator name. " + "The operator names also contain the namespace prefix (e.g. aten::)", + ) + parser.add_argument( + "--tags-path", + help="Path to tags.yaml. Required by yaml parsing in codegen system.", + ) + parser.add_argument( + "--rocm", + action="store_true", + help="reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly", + ) + parser.add_argument( + "--use-aten-lib", + "--use_aten_lib", + action="store_true", + help="a boolean flag to indicate whether we use ATen kernels or not, in the future this flag will be per " + "operator", + ) + parser.add_argument( + "--manual_registration", + "--manual-registration", + action="store_true", + help="a boolean flag to indicate whether we want to manually call" + "register_kernels() or rely on static init. ", + ) + parser.add_argument( + "--generate", + type=str, + nargs="*", + choices=["headers", "sources"], + default=["headers", "sources"], + help="Generate only a subset of files", + ) + options = parser.parse_args() + assert options.tags_path, "tags.yaml is required by codegen yaml parsing." + + selector = get_custom_build_selector( + options.op_registration_whitelist, + options.op_selection_yaml_path, + ) + + parsed_yaml, custom_ops_parsed_yaml = parse_yaml_files( + aten_yaml_path=options.aten_yaml_path, + tags_yaml_path=options.tags_path, + native_yaml_path=options.functions_yaml_path, + custom_ops_yaml_path=options.custom_ops_yaml_path, + selector=selector, + use_aten_lib=options.use_aten_lib, + ) + native_functions, kernel_index = ( + parsed_yaml.native_functions, + parsed_yaml.kernel_index, + ) + custom_ops_native_functions = ( + custom_ops_parsed_yaml.native_functions if custom_ops_parsed_yaml else [] + ) + + cpu_fm = make_file_manager(options=options) + + if "headers" in options.generate: + # generate CustomOpsNativeFunctions.h when custom_ops.yaml is present, to match the build system. 
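+        # gen_headers() writes Functions.h, NativeFunctions.h and RegisterKernels.h
+        # (plus CustomOpsNativeFunctions.h when a custom_ops.yaml was provided).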
+ gen_headers( + native_functions=native_functions, + gen_custom_ops_header=options.custom_ops_yaml_path, + custom_ops_native_functions=custom_ops_native_functions, + selector=selector, + kernel_index=kernel_index, + cpu_fm=cpu_fm, + use_aten_lib=options.use_aten_lib, + ) + + if "sources" in options.generate: + gen_unboxing( + native_functions=native_functions, + cpu_fm=cpu_fm, + selector=selector, + use_aten_lib=options.use_aten_lib, + kernel_index=kernel_index, + manual_registration=options.manual_registration, + ) + if custom_ops_native_functions: + gen_custom_ops( + native_functions=custom_ops_native_functions, + selector=selector, + kernel_index=kernel_index, + cpu_fm=cpu_fm, + rocm=options.rocm, + ) + + if options.output_dependencies: + depfile_path = Path(options.output_dependencies).resolve() + depfile_name = depfile_path.name + depfile_stem = depfile_path.stem + + for fm, prefix in [ + (cpu_fm, ""), + ]: + varname = prefix + depfile_stem + path = depfile_path.parent / (prefix + depfile_name) + fm.write_outputs(varname, str(path)) + + +if __name__ == "__main__": + main() diff --git a/torchgen/gen_functionalization_type.py b/torchgen/gen_functionalization_type.py new file mode 100644 index 00000000000..fbc9459eb5e --- /dev/null +++ b/torchgen/gen_functionalization_type.py @@ -0,0 +1,882 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, TYPE_CHECKING + +from torchgen.api import cpp, dispatcher +from torchgen.api.translate import translate +from torchgen.api.types import ( + BaseCType, + Binding, + CType, + DispatcherSignature, + FunctionalizationLambda, + iTensorListRefT, + NativeSignature, + OptionalCType, + optionalSymIntArrayRefT, + symIntArrayRefT, + SymIntT, + tensorListT, + tensorT, + VectorCType, + ViewInverseSignature, +) +from torchgen.context import ( + method_with_native_function, + native_function_manager, + with_native_function, + with_native_function_and, +) +from torchgen.model import ( + Argument, + BackendIndex, + BaseTy, + BaseType, + FunctionSchema, + ListType, + NativeFunction, + NativeFunctionsGroup, + NativeFunctionsViewGroup, + Return, + SchemaKind, + SelfArgument, + TensorOptionsArguments, +) +from torchgen.native_function_generation import ( + INPLACE_OPS_THAT_DONT_GET_GROUPED_PROPERLY, + MUTABLE_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT, + OUT_OPS_THAT_DONT_GET_GROUPED_PROPERLY, +) +from torchgen.utils import dataclass_repr + + +if TYPE_CHECKING: + from torchgen.selective_build.selector import SelectiveBuilder + + +# Note: [Mutable Ops Not Using Functionalization] +# Ops in this list currently do not work with functionalization and should be fixed. +MUTABLE_OPS_NOT_USING_FUNCTIONALIZATION = ( + OUT_OPS_THAT_DONT_GET_GROUPED_PROPERLY + + MUTABLE_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT + + INPLACE_OPS_THAT_DONT_GET_GROUPED_PROPERLY + + [ + # It will be BC-breaking, but we should fix their schemas. + # should be inplace? + "record_stream", + # See Note [resize_ in Functionalization] + "resize_", + "resize_as_", + # This function is used as for testing purposes only. + "_fill_mem_eff_dropout_mask_", + ] +) + +# This file contains codegen that relates to the functionalization pass. +# It includes: +# - gen_functionalization_definition +# Generates dispatcher kernel definitions for the functionalization pass. +# - gen_functionalization_registration +# Generates dispatcher kernel registrations for the functionalization pass. 
+# - gen_functionalization_view_inverse_declaration +# Generates a declaration for an "inverse view", for every view op +# that is needed in functionalization. We manually implement their definitions. +# - gen_composite_view_copy_kernel +# Generates view_copy() composite kernels for all view_copy operators. + + +# Generates the body of the default composite C++ kernel for a {view}_copy NativeFunction +# See Note [view_copy NativeFunctions] +@dataclass(frozen=True) +class GenCompositeViewCopyKernel: + backend_index: BackendIndex + + @method_with_native_function + def __call__(self, g: NativeFunctionsViewGroup) -> str | None: + if g.view_copy is None: + return None + elif g.view_copy.func.name.name.base != f"{g.view.func.name.name}_copy": + # If the view_copy doesn't match the standard naming scheme of _copy, + # assume it already exists and doesn't need to be generated. + # Example: slice_inverse() with the copy variant named slice_scatter() + # instead of slice_inverse_copy() + return None + + metadata = self.backend_index.get_kernel(g.view_copy) + assert metadata is not None + + # We can make view_copy work in more cases by using reshape() + # when a normal view call would ordinarily fail. + # This also makes LTC more efficient, because they don't need to include + # clone() calls in their graph (which is normally needed by reshape). + if str(g.view_copy.func.name) == "view_copy": + assert metadata.kernel == "view_copy_symint" + return """\ +at::Tensor view_copy_symint(const at::Tensor & self, at::SymIntArrayRef size) { + c10::SymDimVector shape = infer_size_dv(size, self.sym_numel()); + if (!at::detail::computeStride(self.sym_sizes(), self.sym_strides(), shape).has_value()) { + return self.reshape_symint(size); + } else { + auto output = at::_ops::view::call(self, size); + return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous); + } +} +""" + # view_copy is a native signature, since we're generating an at::native:: kernel + # Functionalization always operates on symints though + view_copy_sig = NativeSignature( + g.view_copy.func, symint=metadata.supports_symint() + ) + + # view is a dispatcher signature, since we're calling into the at::_ops API + view_sig = DispatcherSignature(g.view.func) + + view_api_name = g.view.func.name.unambiguous_name() + exprs = ", ".join( + [e.expr for e in translate(view_copy_sig.arguments(), view_sig.arguments())] + ) + + # view ops today always return either a Tensor or a list of Tensors + assert len(g.view.func.returns) == 1 + assert g.view.func.returns[0].type == BaseType( + BaseTy.Tensor + ) or g.view.func.returns[0].type == ListType(BaseType(BaseTy.Tensor), None) + + if g.view.func.returns[0].type == BaseType(BaseTy.Tensor): + return_cloned_output = """\ + return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);""" + else: + # If the return type is a list, we need to clone each tensor in the list. + return_cloned_output = f"""\ + {view_copy_sig.returns_type().cpp_type()} out_clone; + for (const auto i : c10::irange(output.size())) {{ + out_clone.push_back(output[i].clone(/*memory_format=*/at::MemoryFormat::Contiguous)); + }} + return out_clone;""" + + # The default generated composite kernel for {view}_copy() operators just clones + # the input tensor, and runs the underlying view on the clone. 
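+        # i.e. the emitted kernel has roughly this shape:
+        #   <ret> <kernel>(<args>) {
+        #     auto output = at::_ops::<view>::call(<args>);
+        #     return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);
+        #   }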
+ return f""" +{view_copy_sig.defn(name=metadata.kernel)} {{ + auto output = at::_ops::{view_api_name}::call({exprs}); + {return_cloned_output} +}} +""" + + +def return_str(rets: tuple[Return, ...], names: list[str]) -> str: + assert len(rets) == len(names) + if len(rets) == 0: + return "" + elif len(rets) == 1: + return f"return {names[0]};" + else: + return f"return {dispatcher.returns_type(rets).cpp_type()}({', '.join(names)});" + + +def modifies_arguments(f: NativeFunction) -> bool: + return any( + a.annotation is not None and a.annotation.is_write + for a in f.func.arguments.flat_all + ) + + +def wrapper_name(func: FunctionSchema) -> str: + if func.name.overload_name: + return f"{cpp.name(func)}_{func.name.overload_name}" + else: + return cpp.name(func) + + +def is_tensor_like(a: Argument | TensorOptionsArguments | SelfArgument) -> bool: + return isinstance(a, SelfArgument) or ( + isinstance(a, Argument) and a.type.is_tensor_like() + ) + + +# We need to wrap / unwrap various arguments from the op in the functionalization kernels. +# Some op schemas include non-owning types though (like TensorList), +# and when we unwrap them we expect to get out an owning type!. +# We also return a lambda that tells you how to conver the non-owning type argument into the owning type. +def get_owning_type(t: CType) -> tuple[CType, Callable[[str], str]]: + if t == BaseCType(tensorListT): + return VectorCType(BaseCType(tensorT)), lambda x: f"{x}.vec()" + if t == BaseCType(iTensorListRefT): + return VectorCType(BaseCType(tensorT)), lambda x: f"{{{x}.begin(), {x}.end()}}" + # There are technically other non-owning types out there (like IntArrayRef), + # but functionalization only actually cares about the ones involving tensors. + return t, lambda x: x + + +# unwraps all tensor-like arguments, returning: +# (1) a string containing all of the logic that does the unwrapping +# (2) a context, to be used by translate(), with all of the relevant bindings. +def unwrap_tensor_args( + sig: DispatcherSignature, *, is_view_op: bool +) -> tuple[str, list[Binding]]: + context: list[Binding] = [] + unwrapped_tensor_args: list[str] = [] + for arg in sig.arguments(): + if is_tensor_like(arg.argument): + # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. + unwrapped_name = f"{arg.name}_" + # For most ops, the functionalization needs to sync any pending updates on the input tensors + # before calling the operator, since otherwise the operator will act on stale data. + # For view ops though, we can continue to defer syncing until the tensor is used by + # a non-view operator. + maybe_sync_input = ( + "" if is_view_op else f"at::functionalization::impl::sync({arg.name});" + ) + unwrapped_type, conversion_fn = get_owning_type( + arg.nctype.remove_const_ref().type + ) + unwrapped_tensor_args.append( + f""" + {unwrapped_type.cpp_type()} {unwrapped_name}; + if (at::functionalization::impl::isFunctionalTensor({arg.name})) {{ + {maybe_sync_input} + {unwrapped_name} = at::functionalization::impl::from_functional_tensor({arg.name}); + }} else {{ + {unwrapped_name} = {conversion_fn(arg.name)}; + }}""" + ) + context.append(arg.with_name(unwrapped_name)) + else: + # for non-tensor inputs, we want to pass them directly into the redispatch calls. + context.append(arg) + unwrap_tensor_args_str = "\n ".join(unwrapped_tensor_args) + return unwrap_tensor_args_str, context + + +# converts all tensor-like arguments to meta tensors, which are used to compute stride info. 
Returns: +# (1) a string containing all of the logic that does the conversions. +# (2) a context, to be used by translate(), with all of the relevant bindings. +def convert_to_meta_tensors(sig: DispatcherSignature) -> tuple[str, list[Binding]]: + context: list[Binding] = [] + unwrapped_tensor_args: list[str] = [] + for arg in sig.arguments(): + if is_tensor_like(arg.argument): + # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. + a_ = arg.name + unwrapped_name = f"{arg.name}_meta" + unwrapped_tensor_args.append(f"auto {unwrapped_name} = to_meta({a_});") + context.append(arg.with_name(unwrapped_name)) + else: + # for non-tensor inputs, we want to pass them directly into the redispatch calls. + context.append(arg) + unwrap_tensor_args_str = "\n ".join(unwrapped_tensor_args) + return unwrap_tensor_args_str, context + + +# The functionalization codegen currently expects view op schemas to have this form: +# foo(Tensor(a), ...) -> Tensor(a) (e.g. transpose) +# foo(Tensor(a!), ...) -> Tensor(a!) (e.g. transpose_) +def assert_view_op_properties(func: FunctionSchema) -> None: + def is_alias(a: Argument) -> bool: + return a.annotation is not None + + args = func.arguments.flat_non_out + # The first argument is a tensor with an alias semantics (annotations) + assert len(args) > 0 and args[0].type == BaseType( + BaseTy.Tensor + ), f"""In the functionalization codegen, we expect the first argument of every view operator to be a tensor, +but found an argument of type {str(args[0].type)} for operator: {str(func.name)}.""" + # No other arguments have aliasing semantics + assert is_alias(args[0]) and not any( + is_alias(a) for a in args[1:] + ), """In the functionalization codegen, we expect the first argument of every view operator to alias the output. +View operators with multiple aliasing inputs aren't supported yet. Found an operator that doesn't satisfy this constraint""" + + +# One-liner expression for checking if an expression expr of type type has any +# symbolic values. +def emit_expr_has_symbolic_values(expr: str, type: CType) -> str: + if type == BaseCType(SymIntT): + return f"{expr}.is_symbolic()" + + if isinstance(type, OptionalCType): + innerexpr = f"(*{expr})" + return f"{expr}.has_value() ? {emit_expr_has_symbolic_values(innerexpr, type.elem)} : false" + + if type == BaseCType(optionalSymIntArrayRefT): + return emit_expr_has_symbolic_values( + expr, OptionalCType(BaseCType(symIntArrayRefT)) + ) + + if type in (BaseCType(symIntArrayRefT), VectorCType(BaseCType(SymIntT))): + argname = "arg" + lambda_check = emit_expr_has_symbolic_values(argname, BaseCType(SymIntT)) + return ( + "std::any_of(" + f"{expr}.begin(), {expr}.end(), " + f"[=](auto& {argname}) {{ return {lambda_check}; }})" + ) + + raise ValueError( + "unsupported type for has_symbolic_values check. " + "It should be a SymInt or a collection of those. " + f"Got: {type.cpp_type()}" + ) + + +# Detects whether any of the SymInt arguments are, in fact, symbolic values. +# This is used in the constructor of ViewMeta. 
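+# For a single SymInt argument `dim`, the emitted check looks roughly like:
+#   bool has_symbolic_inputs = false;
+#   has_symbolic_inputs = has_symbolic_inputs | (dim.is_symbolic());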
+def emit_has_symbolic_inputs(sig: DispatcherSignature) -> tuple[str, str]: + name = "has_symbolic_inputs" + statements = [ + f"{name} = {name} | ({emit_expr_has_symbolic_values(binding.name, binding.nctype.type)});" + for binding in sig.arguments() + if ( + isinstance(binding.argument, Argument) + and binding.argument.type.is_symint_like() + ) + ] + body = "\n ".join(statements) + return ( + name, + f""" + bool {name} = false; + {body}""", + ) + + +# Generates the Functionalization kernel for: +# - ops that create aliases (e.g. transpose()) +# - ops that are views AND mutations (e.g. transpose_()) +def emit_view_functionalization_body( + g: NativeFunctionsViewGroup, *, view_inplace: bool +) -> str: + if view_inplace: + # This op is both an inplace op AND a view op. + # See Note [Functionalization Pass - Inplace View Ops] for details. + # I currently have the view meta call into the out-of-place variant of the view, to avoid + # having to define an extra ~20 inplace {view}_inverse_ functions. + # Most view ops don't have NativeFunctionGroup's both, because we don't define out= variants for view ops. + # I'm assuming that every inplace-view op has a corresponding out-of-place view op, + # with the same name but the trailing underscore removed. + # This is currently asserted at parse time in gen.py (see error_check_native_functions). + assert g.view_inplace is not None + f = g.view_inplace + else: + f = g.view + + assert g.view_copy is not None + with native_function_manager(f): + call_sig = DispatcherSignature.from_schema(g.view_copy.func) + + # the "view_copy" op name that the functionalization kernels need to call + api_name = g.view_copy.func.name.unambiguous_name() + # Sometimes the functionalization pass needs to no-op (e.g. if it was passed non-functional tensors) + # "no-op"ing in this context is just redispatching to the original op. + noop_api_name = f.func.name.unambiguous_name() + + dispatcher_sig = DispatcherSignature.from_schema(f.func) + assert_view_op_properties(f.func) + view_tensor_name = dispatcher_sig.arguments()[0].name + + return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type() + + unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args( + dispatcher_sig, is_view_op=True + ) + view_redispatch_args = [ + e.expr + for e in translate(unwrapped_args_ctx, call_sig.arguments(), method=False) + ] + + forward_lambda = FunctionalizationLambda.from_func(g, is_reverse=False) + reverse_lambda = FunctionalizationLambda.from_func(g, is_reverse=True) + + # The meta API call should use the same arguments, but convert all tensors to meta tensors first. + meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig) + meta_call_args = [ + e.expr for e in translate(meta_call_ctx, call_sig.arguments(), method=False) + ] + + ( + symbolic_inputs_varname, + symbolic_inputs_check, + ) = emit_has_symbolic_inputs(call_sig) + + if "inplace_view" in f.tags: + # See Note [Functionalization Pass - Inplace View Ops] for more details + return f""" + {dispatcher_sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{ + if (!at::functionalization::impl::isFunctionalTensor({view_tensor_name})) {{ + // functionalization is re-entrant, but will no-op if it wasn't passed a FunctionalTensorWrapper. 
+ {unwrap_tensor_args_str} + at::AutoDispatchSkipFunctionalize guard; + return at::_ops::{noop_api_name}::call({', '.join(view_redispatch_args)}); + }} + auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); + auto inverse_return_mode = ( + reapply_views ? at::functionalization::InverseReturnMode::ViewOrScatterInverse + : at::functionalization::InverseReturnMode::NeverView + ); + {symbolic_inputs_check} + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + {forward_lambda.decl()} {{ + if (reapply_views) {{ + return {forward_lambda.inner_call(reapply_views=True)} + }} else {{ + return {forward_lambda.inner_call(reapply_views=False)} + }} + }}, + {reverse_lambda.decl()} {{ + return {reverse_lambda.inner_call()} + }}, + /*has_symbolic_inputs=*/{symbolic_inputs_varname} + ); + auto compute_reference_meta = + {view_tensor_name}.key_set().has_backend(c10::BackendComponent::XLABit) || + {view_tensor_name}.key_set().has_backend(c10::BackendComponent::LazyBit); + {return_type} reference_tensor_output; + if (compute_reference_meta) {{ + {meta_conversion_str} + at::AutoDispatchSkipFunctionalize func_guard; + c10::impl::ExcludeDispatchKeyGuard guard(exclude_keys_for_meta_dispatch); + reference_tensor_output = at::_ops::{noop_api_name}::call({', '.join(meta_call_args)}); + }} + // This function adds the above view meta to the current tensor and replays them off the base, + // mutating the size/stride info of the current FunctionalTensorWrapper. + // Because of this, we need to make sure to run the reference shape function above, + // BEFORE doing this (otherwise we'll end up runnin the reference function using the wrong sizes/strides) + at::functionalization::impl::mutate_view_meta({view_tensor_name}, view_meta); + // See Note [Propagating strides in the functionalization pass] + // XLA/LTC don't implement the logic to propagate strides correctly, so we need to rely + // on a reference implementation here (instead of relying on the output from the forward lambda + // having the correct stride info) + if (compute_reference_meta) {{ + at::functionalization::impl::set_sizes_strides_offset({view_tensor_name}, reference_tensor_output); + }} + return {view_tensor_name}; + }} +""" + + else: + is_multi_output_view = isinstance(f.func.returns[0].type, ListType) + return f""" + {dispatcher_sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{ + {unwrap_tensor_args_str} + if (!at::functionalization::impl::isFunctionalTensor({view_tensor_name})) {{ + // functionalization is re-entrant, but will no-op if it wasn't passed a FunctionalTensorWrapper. + at::AutoDispatchSkipFunctionalize guard; + return at::_ops::{noop_api_name}::call({', '.join(view_redispatch_args)}); + }} + auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); + auto inverse_return_mode = ( + reapply_views ? 
at::functionalization::InverseReturnMode::ViewOrScatterInverse + : at::functionalization::InverseReturnMode::NeverView + ); + auto compute_reference_meta = + {view_tensor_name}.key_set().has_backend(c10::BackendComponent::XLABit) || + {view_tensor_name}.key_set().has_backend(c10::BackendComponent::LazyBit); + {return_type} reference_tensor_output; + if (compute_reference_meta) {{ + {meta_conversion_str} + at::AutoDispatchSkipFunctionalize func_guard; + c10::impl::ExcludeDispatchKeyGuard guard(exclude_keys_for_meta_dispatch); + reference_tensor_output = at::_ops::{noop_api_name}::call({', '.join(meta_call_args)}); + }} + {return_type} tmp_output; + {{ + at::AutoDispatchSkipFunctionalize guard; + if (reapply_views) {{ + tmp_output = at::_ops::{noop_api_name}::call({', '.join(view_redispatch_args)}); + }} else {{ + tmp_output = at::_ops::{api_name}::call({', '.join(view_redispatch_args)}); + }} + }} + {symbolic_inputs_check} + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + {forward_lambda.decl()} {{ + if (reapply_views) {{ + return {forward_lambda.inner_call(reapply_views=True)} + }} else {{ + return {forward_lambda.inner_call(reapply_views=False)} + }} + }}, + {reverse_lambda.decl()} {{ + return {reverse_lambda.inner_call()} + }}, + /*has_symbolic_inputs=*/{symbolic_inputs_varname}, + /*is_multi_output=*/{str(is_multi_output_view).lower()}, + /*is_as_strided=*/{str(str(f.func.name) == 'as_strided').lower()} + ); + auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, {view_tensor_name}, view_meta); + // See Note [Propagating strides in the functionalization pass] + if (compute_reference_meta) {{ + at::functionalization::impl::set_sizes_strides_offset(out, reference_tensor_output); + }} + return out; + }} +""" + + +def maybe_create_output(f: NativeFunction, var_name: str) -> str: + if len(f.func.returns) == 0: + return "" + return_type = dispatcher.returns_type(f.func.returns).remove_const_ref().cpp_type() + return f"{return_type} {var_name} = " + + +# Given a NativeFunction, and a variable name corresponding to the output of redispatching on the function, +# this returns two lists of names, consisting of: +# - the names of returns corresponding to the original (mutable) inputs of the outer function +# - the names of returns corresponding to the (immutable) outputs of the inner redispatched function +def get_mutable_redispatch_return_names( + f: NativeFunction, inner_return_var: str +) -> tuple[list[str], list[str]]: + aliased_returns = [] + non_aliased_returns = [] + for i, name in enumerate(f.func.aliased_return_names()): + if name is not None: + aliased_returns.append(name) + else: + non_aliased_returns.append( + inner_return_var + if len(f.func.returns) == 1 + else f"std::get<{i}>({inner_return_var})" + ) + return aliased_returns, non_aliased_returns + + +# When functionalization "no-op's" and redispatches on a mutable operator, we need to take care so that: +# - For fresh outputs, we return the result of the redispatch (without wrapping outputs) +# - For outputs that were aliased to inputs, we return the inputs directly (since some of them might have been wrapped) +def return_from_mutable_noop_redispatch( + f: NativeFunction, inner_return_var: str +) -> str: + aliased, non_aliased = get_mutable_redispatch_return_names(f, inner_return_var) + # Just get all of the return names, and immediately return them + return return_str(f.func.returns, aliased + non_aliased) + + +def wrap_propagate_mutations_and_return( + f: 
NativeFunction, functional_op: NativeFunction, inner_return_var: str +) -> str: + mutable_arg_names = f.func.arguments.mutable_arg_names() + ( + aliased_outer_rets, + non_aliased_outer_rets, + ) = get_mutable_redispatch_return_names(f, inner_return_var) + _, non_aliased_inner_rets = get_mutable_redispatch_return_names( + functional_op, inner_return_var + ) + # The outer function may have a mix of aliased and non-aliased outputs, + # But the inner functional op that we're transforming to should only have non-aliased outputs + assert len(mutable_arg_names) + len(non_aliased_outer_rets) == len( + non_aliased_inner_rets + ) + + # First, take all of the newly created outputs from the inner call and wrap them into functional tensors + updates = [] + non_aliased_wrapped_ret_names = [] + for i, inner_ret in enumerate( + non_aliased_inner_rets[: len(non_aliased_outer_rets)] + ): + ret_name = f"output_{i}" + updates.append( + f"""\ + auto output_{i} = at::functionalization::impl::to_functional_tensor({inner_ret});""" + ) + non_aliased_wrapped_ret_names.append(ret_name) + + # Next, take all of the mutated outputs from the inner call corresponding to mutated inputs, + # and propagate the mutations + for outer_arg, inner_ret in zip( + mutable_arg_names, non_aliased_inner_rets[len(non_aliased_outer_rets) :] + ): + updates.append( + f"""\ + auto {outer_arg}_inner = at::functionalization::impl::from_functional_tensor({outer_arg}); + at::functionalization::impl::replace_({outer_arg}, {inner_ret}); + at::functionalization::impl::commit_update({outer_arg}); + at::functionalization::impl::sync({outer_arg}); + auto {outer_arg}_inner_updated = at::functionalization::impl::from_functional_tensor({outer_arg}); + at::functionalization::impl::propagate_xla_data_direct({outer_arg}_inner, {outer_arg}_inner_updated);""" + ) + + # Finally, we return: + # - Any mutable arguments that also returns + # - Any immutable returns that were created wrapping the output from the inner call + returns_str = return_str( + f.func.returns, aliased_outer_rets + non_aliased_wrapped_ret_names + ) + updates_str = "\n".join(updates) + return f"""\ +{updates_str} + {returns_str}""" + + +# Generates the Functionalization kernel for: +# - mutation ops (inplace and out= ops) +@with_native_function_and +def emit_inplace_functionalization_body( + f: NativeFunction, g: NativeFunctionsGroup +) -> str: + # mutation case + assert modifies_arguments(f) + + dispatcher_sig = DispatcherSignature.from_schema(f.func) + + unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args( + dispatcher_sig, is_view_op=False + ) + + mutated_names = [ + a.name + for a in f.func.arguments.flat_all + if a.type.is_tensor_like() and a.annotation is not None + ] + non_mutated_names = [ + a.name + for a in f.func.arguments.flat_all + if a.type.is_tensor_like() and a.annotation is None + ] + non_mutated_tensor_names = [ + a.name + for a in f.func.arguments.flat_all + if a.type == BaseType(BaseTy.Tensor) and a.annotation is None + ] + # all mutable inputs must be functional tensors in order to participate in functionalization + check_all_mutated_args_are_functional = " && ".join( + ["true"] + + [ + f"at::functionalization::impl::isFunctionalTensor({a})" + for a in mutated_names + ] + ) + check_any_non_mutated_args_are_functional = " || ".join( + ["false"] + + [ + f"at::functionalization::impl::isFunctionalTensor({a})" + for a in non_mutated_names + ] + ) + + check_any_non_mutated_tensors_are_xla = " || ".join( + ["false"] + + [ + f"{a}.device().type() == 
c10::DeviceType::XLA" + for a in non_mutated_tensor_names + ] + ) + # These are used in the cases where we don't functionalize and redispatch to the inplace op + # case 1: we hit an inplace op that doesn't have an out-of-place equivalent + # case 2: we hit an inplace ops but our inputs are not functional tensors (in which case our kernel just no-ops) + inplace_exprs = [ + e.expr + for e in translate(unwrapped_args_ctx, dispatcher_sig.arguments(), method=False) + ] + + # call the out-of-place variant of the op + return_type = ( + dispatcher.returns_type(g.functional.func.returns).remove_const_ref().cpp_type() + ) + functional_sig = DispatcherSignature.from_schema(g.functional.func) + functional_exprs = [ + e.expr + for e in translate(unwrapped_args_ctx, functional_sig.arguments(), method=False) + ] + + if f.func.is_out_fn(): + mutable_input_post_processing = "\n".join( + [ + f""" + at::functionalization::impl::replace_( + {a.name}, {'std::get<' + str(i) + '>(tmp_output)' if len(f.func.returns) > 1 else 'tmp_output'}); + at::functionalization::impl::commit_update({a.name});""" + for (i, a) in enumerate(f.func.arguments.out) + if a.annotation and a.annotation.is_write and a.type.is_tensor_like() + ] + ) + else: + mutable_input_post_processing = "\n".join( + [ + f""" + at::functionalization::impl::replace_({a.name}, tmp_output); + at::functionalization::impl::commit_update({a.name});""" + for a in f.func.arguments.flat_all + if a.annotation and a.annotation.is_write and a.type.is_tensor_like() + ] + ) + + meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig) + # We don't want to run the inplace meta func for ops like .set_(), because: + # (1) they're unnecessary: inplace meta checks are only useful for ops like add_(), + # where broadcasting will work for the out-of-place case but should fail on the inplace call + # (2) They'll also fail without adding extra infra: we'd need to convert the input storage argument + # into a meta storage + any_storage_args = any( + a.type == BaseType(BaseTy.Storage) for a in f.func.arguments.flat_all + ) + + return f""" + {dispatcher_sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{ + if ({str(not any_storage_args and f.func.kind() == SchemaKind.inplace).lower()}) {{ + // Before converting the mutable op to its functional variant, run meta tensors through the original op. + // This will help us catch shape errors that apply to inplace ops that wouldn't apply to their functional variants. + // (We can only do this for inplace ops today though, because they technically all support meta tensors). + {meta_conversion_str} + at::AutoDispatchSkipFunctionalize func_guard; + c10::impl::ExcludeDispatchKeyGuard guard(exclude_keys_for_meta_dispatch); + at::_ops::{f.func.name.unambiguous_name()}::call({', '.join(a.name for a in meta_call_ctx)}); + }} + {unwrap_tensor_args_str} + if (!({check_all_mutated_args_are_functional})) {{ + // We want to disable this check if there are any XLA tensors. + // cpu_tensor.copy_(xla_tensor) is valid code. + if (!({check_any_non_mutated_tensors_are_xla}) && ({check_any_non_mutated_args_are_functional})) {{ + // case 1: trying to mutate a non functional tensor with a functional tensor is an error + TORCH_INTERNAL_ASSERT(false, + "mutating a non-functional tensor with a functional tensor is not allowed.", + " Please ensure that all of your inputs are wrapped inside of a functionalize() call."); + }} else {{ + // case 2: arguments are not functional tensors, so we no-op and redispatch. 
+        at::AutoDispatchSkipFunctionalize guard;
+        {maybe_create_output(f, 'tmp_output')}at::_ops::{f.func.name.unambiguous_name()}::call({', '.join(inplace_exprs)});
+        {return_from_mutable_noop_redispatch(f, 'tmp_output')}
+      }}
+    }} else {{
+      {return_type} tmp_output;
+      {{
+        at::AutoDispatchSkipFunctionalize guard;
+        tmp_output = at::_ops::{g.functional.func.name.unambiguous_name()}::call({', '.join(functional_exprs)});
+      }}
+      {wrap_propagate_mutations_and_return(f, g.functional, 'tmp_output')}
+    }}
+  }}"""
+
+
+# The below functions generate RegisterFunctionalization.cpp
+# These files provide the kernels that run the functionalization pass, which can be opted into
+# per backend (e.g. XLA or Vulkan), or as a composable transform (functionalize() in functorch).
+
+
+# See Note [Functionalization Pass: View Inverses].
+def gen_functionalization_view_inverse_declaration(
+    selector: SelectiveBuilder, g: NativeFunctionsViewGroup
+) -> str | None:
+    # For every (non-composite) view op, we need a corresponding "inverse view" function.
+    # This generates the declarations so we get a good compiler error when someone adds a new view.
+    @with_native_function
+    def emit_decl_helper(g: NativeFunctionsViewGroup) -> str | None:
+        if g.view.has_composite_implicit_autograd_kernel:
+            return None
+        view_inverse_sig = ViewInverseSignature(g)
+        return view_inverse_sig.decl()
+
+    return emit_decl_helper(g)
+
+
+def gen_functionalization_registration(
+    selector: SelectiveBuilder,
+    g: NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup,
+    composite_implicit_autograd_index: BackendIndex,
+) -> list[str]:
+    @with_native_function
+    def emit_registration_helper(f: NativeFunction) -> str:
+        assert not f.has_composite_implicit_autograd_kernel
+        registration_str = f"TORCH_FN(functionalization::{wrapper_name(f.func)})"
+        return f'm.impl("{f.func.name}", {registration_str});'
+
+    # Don't generate kernels in mobile build
+    if not selector.include_all_operators:
+        return []
+
+    if isinstance(g, NativeFunctionsViewGroup):
+        # functionalization needs to register kernels for view + view_inplace ops
+        # See Note [Functionalization <> torch.Tensor constructor]
+        if str(g.view.func.name) == "lift_fresh":
+            return []
+        view_str = []
+        if not g.view.has_composite_implicit_autograd_kernel:
+            view_str.append(emit_registration_helper(g.view))
+        if (
+            g.view_inplace is not None
+            and not g.view_inplace.has_composite_implicit_autograd_kernel
+        ):
+            assert g.view_inplace.is_view_op
+            view_str.append(emit_registration_helper(g.view_inplace))
+        return view_str
+
+    elif isinstance(g, NativeFunctionsGroup):
+        # Gets a hand-written functionalization kernel
+        if g.inplace is not None and str(g.inplace.func.name) == "set_.source_Tensor":
+            fns = []
+        else:
+            fns = list(g.functions())
+    else:
+        if str(g.func.name) in MUTABLE_OPS_NOT_USING_FUNCTIONALIZATION:
+            return []
+        fns = [g]
+
+    registrations = []
+    for f in fns:
+        if f.has_composite_implicit_autograd_kernel:
+            continue
+        if str(f.func.name) == "lift":
+            # See Note [Functionalization <> torch.Tensor constructor]
+            return []
+        if str(f.func.name) == "resize_":
+            # See Note [resize_ in Functionalization]
+            return []
+        if str(f.func.name.name) != "set_":
+            assert not f.is_view_op
+        # functionalization needs to generate and register kernels for inplace ops.
+        # We *also* need to directly register CompositeImplicitAutograd kernels
+        # so that they decompose properly before functionalization.
+ if modifies_arguments(f): + registrations.append(emit_registration_helper(f)) + return registrations + + +def gen_functionalization_definition( + selector: SelectiveBuilder, + # Note: Ideally this code should never have to look at NativeFunction + # (and instead only need to operate on grouped NativeFunctions). + # The only reason currently is because we need to emit direct dispatch registrations + # For CompositeImplicitAutograd operators, which are potentially ungrouped. + g: NativeFunction | NativeFunctionsGroup | NativeFunctionsViewGroup, +) -> list[str]: + # Don't generate kernels in mobile build + if not selector.include_all_operators: + return [] + + if isinstance(g, NativeFunctionsViewGroup): + # Case 1: emit view -> view_copy kernels for the functionalization pass + view_defs = [] + if not g.composite: + # invariant: NativeFunctionsViewGroup's always have a view_copy operator + # if the view is not composite (implicit autograd) + assert g.view_copy is not None, dataclass_repr(g, indent=1) + view_defs.append(emit_view_functionalization_body(g, view_inplace=False)) + if g.view_inplace is not None: + view_defs.append(emit_view_functionalization_body(g, view_inplace=True)) + return view_defs + elif isinstance(g, NativeFunction): + # Invariant: all mutable operators that we need to handle in functionalization + # should have been properly grouped up. + # TODO: The below ops all have "problematic" schemas that prevent them from + # getting functionalized. Instead of bending over backwards to get things to work, + # I think we should either: + # (1) fix their schemas (BC-breaking) + # (2) hand-write their functionalization kernels + if ( + str(g.func.name) not in MUTABLE_OPS_NOT_USING_FUNCTIONALIZATION + and str(g.func.name.name) not in MUTABLE_OPS_NOT_USING_FUNCTIONALIZATION + ): + assert g.has_composite_implicit_autograd_kernel or not modifies_arguments(g) + return [] + else: + # Case 2: emit inplace -> out-of-place kernels for the functionalization pass + mutation_defs = [] + mutation_defs.append(emit_inplace_functionalization_body(g.out, g)) + if g.inplace is not None: + mutation_defs.append(emit_inplace_functionalization_body(g.inplace, g)) + if g.mutable is not None: + mutation_defs.append(emit_inplace_functionalization_body(g.mutable, g)) + return mutation_defs + return [] diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py new file mode 100644 index 00000000000..884f645cc4b --- /dev/null +++ b/torchgen/gen_lazy_tensor.py @@ -0,0 +1,581 @@ +from __future__ import annotations + +import argparse +import os +from collections import namedtuple +from pathlib import Path +from typing import Any, Callable, Iterable, Iterator, Sequence + +import yaml + +import torchgen.dest as dest +from torchgen.api.lazy import setValueT +from torchgen.api.types import BaseCppType +from torchgen.dest.lazy_ir import GenLazyIR, GenLazyNativeFuncDefinition, GenTSLazyIR +from torchgen.gen import get_grouped_native_functions, parse_native_yaml +from torchgen.gen_backend_stubs import ( + error_on_missing_kernels, + gen_dispatcher_registrations, + gen_dispatchkey_nativefunc_headers, + parse_backend_yaml, +) +from torchgen.model import NativeFunction, NativeFunctionsGroup, OperatorName +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.utils import FileManager, NamespaceHelper +from torchgen.yaml_utils import YamlLoader + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Lazy Tensor Codegen +# +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# Overview +# ~~~~~~~~ +# +# This codegen script builds on existing data models and helpers used +# by all ATen backends, and adds new functionality specific to lazy +# tensor backends. +# +# Inputs: +# - _native_functions.yaml: controls which operators are +# supported by the backend. +# +# Outputs: +# (for all backends) +# Ir.h defines Lazy IR classes to be constructed during tracing +# - opt-in: also generate 'lowering' methods for the TorchScript backend only +# NativeFunctions.cpp defines implementations of native functions which perform lazy tracing +# - opt-in: 'full_codegen' section of backend yaml; 'supported' section omits these implementations +# NativeFunctions.h declares implementations of native functions for both 'supported' and 'full_codegen' +# ops +# +# Register.cpp registers all op implementations with the dispatcher +# RegisterAutograd.cpp registers all autograd implementations with the dispatcher +# +# Validation Helpers: +# - Shape Inference: errs if any ops in backend yaml require shape inference not provided by meta kernels or +# implementations in torch/csrc/lazy/core/shape_inference.* +# - native function impls: errs if any 'supported' ops do not have an implementation defined in the backend +# (non-codegen) implementation file +# +# +# About the Data Model +# ~~~~~~~~~~~~~~~~~~~~ +# +# Modeled after ATen codegen, the first step is to parse yaml and build a data model for the operators +# we care about. In this case, the _native_functions yaml defines a subset of the core operators +# (defined in more detail in the main native_functions.yaml), which will be supported by your backend. +# Backends can list ops in two categories: +# - `supported` ops require hand-implementations but still get codegenned declarations and registrations +# - `full_codegen` ops get implementations (and IR classes) generated too +# +# Each native function is modeled as an object with a schema, and each schema has objects representing their +# arguments. Much of the codegen is manipulation of the arguments and their types. For example, lazy tensor +# backends need to transform 'at::Tensor' arguments into 'lazy::Value' objects, as well as replacing reference +# types (stringref) with actual string objects, and this is done by manipulating the data model objects. +# - see api/lazy.py for the lazy data model +# +# Once the data model is set up, the rest of this script processes a number of templates for output CPP file +# and fills in the template values using helpers in `dest/lazy_ir.py` and `dest/lazy_ts_lowering.py`. These +# helpers mostly iterate over functions and their arguments, outputting different c++ snippets. +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. 
+# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping, full_codegen) +ParsedExternalYaml = namedtuple( + "ParsedExternalYaml", + ["backend_key", "autograd_key", "cpp_namespace", "backend_indices", "full_codegen"], +) + + +def parse_native_functions_keys( + backend_yaml_path: str, + grouped_native_functions: Sequence[NativeFunction | NativeFunctionsGroup], +) -> tuple[list[OperatorName], list[Any], list[OperatorName]]: + with open(backend_yaml_path) as f: + yaml_values = yaml.load(f, Loader=YamlLoader) + assert isinstance(yaml_values, dict) + + full_codegen = yaml_values.pop("full_codegen", []) + non_native = yaml_values.pop("non_native", []) + ir_gen = yaml_values.pop("ir_gen", []) + assert isinstance(full_codegen, list) + assert isinstance(non_native, list) + assert isinstance(ir_gen, list) + full_codegen_opnames = [OperatorName.parse(name) for name in full_codegen] + ir_gen_opnames = [OperatorName.parse(name) for name in ir_gen] + return full_codegen_opnames, non_native, ir_gen_opnames + + +def validate_shape_inference_header( + shape_inference_hdr: str, expected_shape_infr_decls: list[str] +) -> None: + try: + with open(shape_inference_hdr) as f: + shape_infr_decls = f.read() + shape_infr_decl_lines = set(shape_infr_decls.split("\n")) + except OSError as e: + raise AssertionError( + f"Unable to read from the specified shape_inference_hdr file: {shape_inference_hdr}" + ) from e + + # TODO(whc) add a check for shape inference functions that have meta kernels implement and should be retired. + + missing_decls = [ + decl for decl in expected_shape_infr_decls if decl not in shape_infr_decl_lines + ] + if missing_decls: + raise Exception( # noqa: TRY002 + f"""Missing shape inference function.\n +Please add declare this function in {shape_inference_hdr}:\n +and implement it in the corresponding shape_inference.cpp file.\n +{os.linesep.join(missing_decls)}""" + ) + + +# Some helper functions for the codegen. +def get_ltc_helper_fns() -> str: + return """\ +at::Tensor to_meta(const at::Tensor& tensor) { + // undefined tensors can't be converted to the meta device, since they don't have sizes/strides + if (!tensor.defined()) return tensor; + auto out = at::native::empty_strided_meta_symint(tensor.sym_sizes(), tensor.sym_strides(), \ +/*dtype=*/std::make_optional(tensor.scalar_type()), /*layout=*/std::make_optional(tensor.layout()), \ +/*device=*/std::make_optional(c10::Device(c10::kMeta)), /*pin_memory=*/std::nullopt); + // needs to handle wrapped numbers, so dtype promotion works properly. 
+  if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
+    out.unsafeGetTensorImpl()->set_wrapped_number(true);
+  }
+  return out;
+}
+std::optional<at::Tensor> to_meta(const std::optional<at::Tensor>& tensor) {
+  if (tensor.has_value()) {
+    return to_meta(*tensor);
+  }
+  return std::nullopt;
+}
+
+std::vector<at::Tensor> to_meta(at::ITensorListRef t_list) {
+  std::vector<at::Tensor> outs;
+  outs.reserve(t_list.size());
+  for (const auto& tensor : t_list) {
+    outs.push_back(to_meta(tensor));
+  }
+  return outs;
+}
+"""
+
+
+class default_args:
+    node_base: str = "Node"
+    node_base_hdr: str | None = None
+    shape_inference_hdr: str = "torch/csrc/lazy/core/shape_inference.h"
+    tensor_class: str = "torch::lazy::LazyTensor"
+    tensor_class_hdr: str = "torch/csrc/lazy/core/tensor.h"
+    lazy_ir_generator: type[GenLazyIR] = GenLazyIR
+    native_func_definition_generator: type[
+        GenLazyNativeFuncDefinition
+    ] = GenLazyNativeFuncDefinition
+    backend_name: str = "TorchScript"
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate Lazy Tensor backend files")
+    parser.add_argument(
+        "-s",
+        "--source-yaml",
+        "--source_yaml",
+        help="path to source yaml file containing operator external definitions",
+    )
+    parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory")
+    parser.add_argument(
+        "--dry-run", "--dry_run", type=bool, default=False, help="dry run (do not write output files)"
+    )
+    parser.add_argument(
+        "--impl-path",
+        "--impl_path",
+        type=str,
+        default=None,
+        help="path to the source C++ file containing kernel definitions",
+    )
+    parser.add_argument(
+        "--gen-ts-lowerings",
+        "--gen_ts_lowerings",
+        action="store_true",
+        help="Generate TorchScript lowerings in addition to Lazy IR and NativeFunctions",
+    )
+    parser.add_argument(
+        "--node-base",
+        "--node_base",
+        type=str,
+        default=default_args.node_base,
+        help="Name of backend specific custom Lazy IR Node base class",
+    )
+    parser.add_argument(
+        "--node-base-hdr",
+        "--node_base_hdr",
+        type=str,
+        default=default_args.node_base_hdr,
+        help="Path to header file defining custom Lazy IR Node base class",
+    )
+    parser.add_argument(
+        "--shape-inference-hdr",
+        "--shape_inference_hdr",
+        type=str,
+        default=default_args.shape_inference_hdr,
+        help="Path to header file defining custom Lazy shape inference functions",
+    )
+    parser.add_argument(
+        "--tensor-class",
+        "--tensor_class",
+        type=str,
+        default=default_args.tensor_class,
+        help="Name of backend specific custom Lazy Tensor class",
+    )
+    parser.add_argument(
+        "--tensor-class-hdr",
+        "--tensor_class_hdr",
+        type=str,
+        default=default_args.tensor_class_hdr,
+        help="Path to header file defining custom Lazy Tensor class",
+    )
+    parser.add_argument(
+        "--backend-name",
+        "--backend_name",
+        type=str,
+        default=default_args.backend_name,
+        help="Name of the backend to generate",
+    )
+    options = parser.parse_args()
+
+    # Assumes that this file lives at PYTORCH_ROOT/torchgen/gen_backend_stubs.py
+    torch_root = Path(__file__).parent.parent.parent.absolute()
+    aten_path = str(torch_root / "aten" / "src" / "ATen")
+    lazy_ir_generator: type[GenLazyIR] = default_args.lazy_ir_generator
+    if options.gen_ts_lowerings:
+        lazy_ir_generator = GenTSLazyIR
+    native_func_definition_generator: type[
+        GenLazyNativeFuncDefinition
+    ] = default_args.native_func_definition_generator
+
+    run_gen_lazy_tensor(
+        aten_path,
+        options.source_yaml,
+        options.output_dir,
+        options.dry_run,
+        options.impl_path,
+        options.node_base,
+        options.node_base_hdr,
+        options.tensor_class,
+        options.tensor_class_hdr,
+        
options.shape_inference_hdr, + lazy_ir_generator, + native_func_definition_generator, + options.backend_name, + ) + + +def run_gen_lazy_tensor( + aten_path: str, + source_yaml: str, + output_dir: str, + dry_run: bool, + impl_path: str | None, + node_base: str = default_args.node_base, + node_base_hdr: str | None = default_args.node_base_hdr, + tensor_class: str = default_args.tensor_class, + tensor_class_hdr: str = default_args.tensor_class_hdr, + shape_inference_hdr: str = default_args.shape_inference_hdr, + lazy_ir_generator: type[GenLazyIR] = default_args.lazy_ir_generator, + native_func_definition_generator: type[ + GenLazyNativeFuncDefinition + ] = default_args.native_func_definition_generator, + # build_in_tree is true for TS backend and affects include paths + build_in_tree: bool = False, + # per_operator_headers changes whether ATen/Functions.h or individual operator headers are used + # it must match how ATen was built + per_operator_headers: bool = False, + backend_name: str = default_args.backend_name, + gen_forced_fallback_code: bool = False, + use_lazy_shape: bool = True, + # the following arguments are temporary customization points for xla backend migration. + # do not rely on them otherwise, they should be removed once migration is complete + backend_namespace: str = "torch::lazy", + get_tensorlist: str = "GetTensorList", + get_tensor_or_wrap_number: str = "GetLtcTensorOrCreateForWrappedNumber", + try_get_tensor: str = "TryGetLtcTensor", + metrics_counter: str = 'TORCH_LAZY_FN_COUNTER("lazy::")', + create_tensor: str = "LazyTensor::Create", + create_from_first_tensor: bool = False, + create_aten_from_ltc_tensor: str = "torch::lazy::CreateAtenFromLtcTensor", + tuple_aten_from_ltc_tensors: str = "torch::lazy::TupleAtenFromLtcTensors", + lazy_value_class: str = "torch::lazy::Value", + lazy_tensor_ptr: str = "LazyTensorPtr", + get_device_fn: str = "torch::lazy::GetBackendDevice", +) -> None: + lv_tokens = lazy_value_class.split("::") + lv_class = lv_tokens[-1] + lv_ns = "::".join(lv_tokens[:-1]) + setValueT(BaseCppType(lv_ns, lv_class)) + template_dir = os.path.join(aten_path, "templates") + + def make_file_manager(install_dir: str) -> FileManager: + return FileManager( + install_dir=install_dir, template_dir=template_dir, dry_run=dry_run + ) + + fm = make_file_manager(output_dir) + + native_yaml_path = os.path.join(aten_path, "native/native_functions.yaml") + tags_yaml_path = os.path.join(aten_path, "native/tags.yaml") + parsed_yaml = parse_native_yaml(native_yaml_path, tags_yaml_path) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + grouped_native_functions = get_grouped_native_functions(native_functions) + + def sort_native_function(f: NativeFunctionsGroup | NativeFunction) -> str: + """ + We sort the native function because of the note in concat_map_codegen. + TODO(alanwaketan): Remove this sorting hack once all ops are grouped properly. 
+ """ + func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func + return str(func.name.name) + + grouped_native_functions = sorted( + grouped_native_functions, key=sort_native_function + ) + + parsed_backend_yaml = parse_backend_yaml( + source_yaml, grouped_native_functions, backend_indices + ) + backend_key = parsed_backend_yaml.backend_key + autograd_key = parsed_backend_yaml.autograd_key + cpp_namespace = parsed_backend_yaml.cpp_namespace + backend_indices = parsed_backend_yaml.backend_indices + # the following 3 keys are all processed differently + # for full_codegen, we generate IR, kernels, etc + # for ir_gen, we generate only IR + # non_native is used to register kernels not declared in + # native_functions.yaml + full_codegen, non_native, ir_gen = parse_native_functions_keys( + source_yaml, grouped_native_functions + ) + + def concat_map_codegen( + func: Callable[[NativeFunction], Sequence[str]], + xs: Iterable[NativeFunctionsGroup | NativeFunction], + ops_list: list[OperatorName] = full_codegen, + ) -> Iterator[str]: + """ + We code-gen for the functional variant, which is all we need for IR classes/lowerings/shape inferences, but we + only code-gen additional entries for the inplace variant for the native functions. + """ + + for x in xs: + fs = list(x.functions()) if isinstance(x, NativeFunctionsGroup) else [x] + for f in fs: + if f.func.name in ops_list: + yield from func(f) + + selector = SelectiveBuilder.get_nop_selector() + + assert backend_key is not None + class_name = backend_indices[backend_key].native_function_class_name() + + if impl_path is not None: + error_on_missing_kernels( + native_functions, + backend_indices, + backend_key, + autograd_key, + class_name, + impl_path, + full_codegen, + ) + + """ Validate Shape Inference Definitions + + Generated lazy native functions all perform shape inference, by first using a meta:: kernel + if available for that op, and otherwise using a 'compute_shape_{op}' function instead. The generator + knows the call signature for compute_shape_{op} because it matches the nativefunction (and meta::) signature, + so it just has to check whether the op is structured and generate a call for one or the other. It's up to the dev + to supply the missing compute_shape_{op} function, but the codegen at least warns you about this and provides + the expected signature which can be copy-pasted into shape_inference.h. + + compute_shape_{op} functions are handwritten and should be replaced over time as ops get ported + to structured kernels. + + See torch/csrc/lazy/core/shape_inference.cpp #READ THIS! for more information. + """ + if shape_inference_hdr is not None: + expected_shape_infr_decls = list( + concat_map_codegen( + dest.GenLazyShapeInferenceDefinition( + backend_indices[backend_key], tensor_class + ), + grouped_native_functions, + ) + ) + + validate_shape_inference_header(shape_inference_hdr, expected_shape_infr_decls) + assert class_name is not None + + # Generate nativefunction declarations + # Note, eager registrations is set to False for the lazy TS backend as another LTC backend + # may want to register their own lazy kernels instead of registering the TS ones. + # The registration will lazily happen when init_ts_backend is called. 
+ gen_dispatchkey_nativefunc_headers( + fm, + class_name, + cpp_namespace, + backend_indices, + grouped_native_functions, + backend_key, + autograd_key, + backend_name, + ) + + # Generate Dispatcher registrations which hook up the nativefunctions + for dispatch_key in ( + [backend_key] if autograd_key is None else [backend_key, autograd_key] + ): + gen_dispatcher_registrations( + fm, + output_dir, + class_name, + backend_indices, + grouped_native_functions, + backend_key, + dispatch_key, + selector, + build_in_tree=build_in_tree, + per_operator_headers=per_operator_headers, + backend_name=backend_name, + eager_registration=False, + ) + + # Generate native function impls that build IR nodes + ns_helper = NamespaceHelper(cpp_namespace) + fm.write_with_template( + f"{backend_key}NativeFunctions.cpp", + "DispatchKeyNativeFunctions.cpp", + lambda: { + "includes": [ + f"#include <{path}>" + for path in [ + tensor_class_hdr, + shape_inference_hdr, + "ATen/Functions.h", + "ATen/native/TensorConversions.h", + "ATen/NativeFunctions.h", + "ATen/CompositeExplicitAutogradNonFunctionalFunctions.h", + "ATen/MetaFunctions.h", + "ATen/Operators.h", + "ATen/native/CPUFallback.h", + "torch/csrc/lazy/core/ir_builder.h", + "torch/csrc/lazy/core/lazy_graph_executor.h", + "torch/csrc/lazy/core/metrics.h", + "torch/csrc/lazy/core/shape.h", + f"{output_dir}/{backend_key}NativeFunctions.h", + f"{output_dir}/LazyIr.h", + ] + + ( + ["torch/csrc/lazy/ts_backend/ts_eager_fallback.h"] + if gen_forced_fallback_code + else [] + ) + ], + "helper_fns": get_ltc_helper_fns(), + "native_functions_include": "", + "namespace_prologue": ns_helper.prologue, + "namespace_epilogue": ns_helper.epilogue, + "native_function_definitions": list( + concat_map_codegen( + native_func_definition_generator( + f"{backend_key}NativeFunctions", + backend_indices[backend_key], + tensor_class, + gen_forced_fallback_code, + backend_namespace, + get_tensorlist, + get_tensor_or_wrap_number, + try_get_tensor, + metrics_counter, + create_tensor, + create_from_first_tensor, + create_aten_from_ltc_tensor, + tuple_aten_from_ltc_tensors, + lazy_tensor_ptr, + get_device_fn, + ), + grouped_native_functions, + ) + ), + }, + ) + # Generate IR node classes + lazy_ir_obj = lazy_ir_generator( + backend_indices[backend_key], backend_name, node_base, use_lazy_shape + ) + + fm.write_with_template( + "LazyIr.h", + "LazyIr.h", + lambda: { + "lazy_ir_sysinc": [ + f"#include <{path}>" + for path in [ + "ATen/core/Formatting.h", + "c10/core/ScalarType.h", + "torch/csrc/lazy/core/hash.h", + "torch/csrc/lazy/core/ir.h", + "torch/csrc/lazy/core/shape.h", + "optional", + "vector", + ] + ], + "lazy_ir_inc": [f'#include "{node_base_hdr}"'] + if node_base_hdr is not None + else [], + "ir_declarations": list( + concat_map_codegen( + lazy_ir_obj, grouped_native_functions, full_codegen + ir_gen + ) + ), + "namespace_prologue": ns_helper.prologue, + "namespace_epilogue": ns_helper.epilogue, + }, + ) + + # Generate Non Native IR Node classes + fm.write_with_template( + "LazyNonNativeIr.h", + "LazyNonNativeIr.h", + lambda: { + "lazy_non_native_ir_inc": [ + f"#include <{path}>" + for path in [ + "torch/csrc/lazy/core/ir.h", + "torch/csrc/lazy/core/ir_builder.h", + "torch/csrc/lazy/core/internal_ops/ltc_ops.h", + "torch/csrc/lazy/core/shape_inference.h", + ] + + ([node_base_hdr] if node_base_hdr else []) + if path + ], + "non_native_ir_nodes": dest.generate_non_native_lazy_ir_nodes( + non_native, lazy_ir_obj + ), + "namespace_prologue": ns_helper.prologue, + "namespace_epilogue": 
ns_helper.epilogue, + }, + ) + + +if __name__ == "__main__": + main() diff --git a/torchgen/gen_schema_utils.py b/torchgen/gen_schema_utils.py new file mode 100644 index 00000000000..975fbee6df9 --- /dev/null +++ b/torchgen/gen_schema_utils.py @@ -0,0 +1,97 @@ +from typing import Any, Optional, Tuple, Union + +from torchgen.model import ( + Annotation, + Argument, + Arguments, + BaseOperatorName, + BaseTy, + BaseType, + CustomClassType, + FunctionSchema, + ListType, + OperatorName, + Return, +) + + +# Note: These aren't actually used in torchgen, they're some utilities for generating a schema +# from real arguments. For example, this is used to generate HigherOrderOperators' schema since +# their schemas can vary for different instances of the same HOP. + + +class TypeGen: + convert_to_base_ty = { + int: BaseTy.int, + float: BaseTy.float, + str: BaseTy.str, + bool: BaseTy.bool, + } + + @staticmethod + def from_example(obj: Any) -> Union[BaseType, ListType, CustomClassType]: + import torch + + if isinstance(obj, torch.fx.GraphModule): + return BaseType(BaseTy.GraphModule) + elif isinstance(obj, torch.Tensor): + return BaseType(BaseTy.Tensor) + elif isinstance(obj, torch.SymInt): + return BaseType(BaseTy.SymInt) + elif isinstance(obj, torch.SymBool): + return BaseType(BaseTy.SymBool) + elif isinstance(obj, torch.ScriptObject): + return CustomClassType(obj._type().name()) # type: ignore[attr-defined] + elif isinstance(obj, (list, tuple)): + assert len(obj) > 0 + all_base_tys = [TypeGen.from_example(x) for x in obj] + if len(set(all_base_tys)) > 1: + raise RuntimeError( + f"Cannot generate schema for a seqeunce of args of heterogeneous types: {all_base_tys}. " + "Consider unpacking the argument and give proper names to them if possible " + "instead of using *args." + ) + return ListType(all_base_tys[0], len(obj)) + tp = type(obj) + if tp not in TypeGen.convert_to_base_ty: + raise RuntimeError(f"unsupported type {tp}") + return BaseType(TypeGen.convert_to_base_ty[tp]) + + +class ReturnGen: + @staticmethod + def from_example( + name: Optional[str], obj: Any, annotation: Optional[Annotation] + ) -> Return: + return Return(name, TypeGen.from_example(obj), annotation) + + +class ArgumentGen: + @staticmethod + def from_example( + name: str, obj: Any, default: Optional[str], annotation: Optional[Annotation] + ) -> Argument: + return Argument( + name, TypeGen.from_example(obj), default=default, annotation=annotation + ) + + +class FunctionSchemaGen: + @staticmethod + def from_example( + op_name: str, + example_inputs: Tuple[Tuple[str, Any], ...], + example_outputs: Tuple[Any, ...], + ) -> FunctionSchema: + args = [] + for name, inp in example_inputs: + args.append(ArgumentGen.from_example(name, inp, None, None)) + # ignore the annotations and other attributes for now, we could add more when needed. 
+        arguments = Arguments(
+            tuple(), None, tuple(args), tuple(), None, tuple(), tuple()
+        )
+        returns = tuple(
+            ReturnGen.from_example(None, out, None) for out in example_outputs
+        )
+        op_name = OperatorName(BaseOperatorName(op_name, False, False, False), "")
+        return FunctionSchema(op_name, arguments, returns)
diff --git a/torchgen/gen_vmap_plumbing.py b/torchgen/gen_vmap_plumbing.py
new file mode 100644
index 00000000000..af9af6454eb
--- /dev/null
+++ b/torchgen/gen_vmap_plumbing.py
@@ -0,0 +1,271 @@
+from __future__ import annotations
+
+import textwrap
+from dataclasses import dataclass
+from typing import Sequence
+
+from torchgen.api.translate import translate
+from torchgen.api.types import DispatcherSignature
+from torchgen.context import method_with_native_function
+from torchgen.model import (
+    Argument,
+    BaseTy,
+    BaseType,
+    FunctionSchema,
+    ListType,
+    NativeFunction,
+    OptionalType,
+    Return,
+    SchemaKind,
+    Type,
+)
+from torchgen.utils import mapMaybe
+
+
+def is_tensor(typ: Type) -> bool:
+    return isinstance(typ, BaseType) and typ.name == BaseTy.Tensor
+
+
+def is_optional_tensor(typ: Type) -> bool:
+    return isinstance(typ, OptionalType) and is_tensor(typ.elem)
+
+
+def is_tensor_list(typ: Type) -> bool:
+    return isinstance(typ, ListType) and is_tensor(typ.elem)
+
+
+def unwrap_tensor(name: str, cur_level_var: str) -> list[str]:
+    result = f"""\
+    auto [{name}_value, {name}_bdim] = unwrapTensorAtLevel({name}, {cur_level_var});"""
+    return textwrap.dedent(result).split("\n")
+
+
+def unwrap_optional_tensor(name: str, cur_level_var: str) -> list[str]:
+    result = f"""\
+    std::optional<Tensor> {name}_value;
+    std::optional<int64_t> {name}_bdim;
+    if ({name}) {{
+        std::tie({name}_value, {name}_bdim) = unwrapTensorAtLevel({name}.value(), {cur_level_var});
+    }}"""
+    return textwrap.dedent(result).split("\n")
+
+
+def gen_unwraps(
+    flat_arguments: Sequence[Argument], cur_level_var: str
+) -> tuple[str, list[str]]:
+    arg_names = [a.name for a in flat_arguments]
+    arg_types = [a.type for a in flat_arguments]
+
+    tensors = [name for typ, name in zip(arg_types, arg_names) if is_tensor(typ)]
+    optional_tensors = [
+        name for typ, name in zip(arg_types, arg_names) if is_optional_tensor(typ)
+    ]
+
+    unwraps = []
+    for tensor in tensors:
+        unwraps += unwrap_tensor(tensor, cur_level_var)
+
+    for opt_tensor in optional_tensors:
+        unwraps += unwrap_optional_tensor(opt_tensor, cur_level_var)
+    unwrap_code = "\n".join(unwraps)
+
+    unwrapped_arg_list = []
+    for arg in arg_names:
+        if arg in tensors or arg in optional_tensors:
+            unwrapped_arg_list += [f"{arg}_value", f"{arg}_bdim"]
+        else:
+            unwrapped_arg_list.append(arg)
+    return unwrap_code, unwrapped_arg_list
+
+
+def gen_case_where_all_bdims_are_none(
+    outer_sig: DispatcherSignature, schema: FunctionSchema, cur_level_var: str
+) -> str:
+    conditions = []
+    flat_args = schema.arguments.flat_all
+    for arg in flat_args:
+        if not arg.type.is_tensor_like():
+            continue
+        conditions.append(f"!isBatchedAtLevel({arg.name}, {cur_level_var})")
+
+    sig = DispatcherSignature.from_schema(schema)
+    translated_args = ", ".join(
+        e.expr for e in translate(outer_sig.arguments(), sig.arguments())
+    )
+    return f"""\
+if ({' && '.join(conditions)}) {{
+  return at::_ops::{sig.func.name.unambiguous_name()}::call({translated_args});
+}}"""
+
+
+def gen_returns(
+    returns: tuple[Return, ...], cur_level_var: str, results_var: str
+) -> str:
+    idx = 0
+    wrapped_returns = []
+    for ret in returns:
+        if is_tensor(ret.type):
+            wrapped_returns.append(
+                f"makeBatched(std::get<{idx}>({results_var}), std::get<{idx + 1}>({results_var}), {cur_level_var})"
+            )
+            idx += 2
+        elif is_tensor_list(ret.type):
+            wrapped_returns.append(
+                f"makeBatchedVector(std::get<{idx}>({results_var}), std::get<{idx+1}>({results_var}), {cur_level_var})"
+            )
+            idx += 2
+        else:
+            wrapped_returns.append(f"std::get<{idx}>({results_var})")
+            idx += 1
+    if len(wrapped_returns) == 1:
+        result = f"return {wrapped_returns[0]};"
+    else:
+        result = f'return std::make_tuple({", ".join(wrapped_returns)});'
+    return result
+
+
+def accepts_at_least_one_tensor_input(schema: FunctionSchema) -> bool:
+    return any(a.type.is_tensor_like() for a in schema.arguments.flat_all)
+
+
+def is_mutated_arg(argument: Argument) -> bool:
+    return argument.annotation is not None and argument.annotation.is_write
+
+
+def gen_vmap_inplace_plumbing(native_function: NativeFunction) -> str | None:
+    # Assumptions:
+    # - only one argument is being modified in-place
+    # - the argument that is being modified in-place is the first argument
+    # - all returns are either Tensor, tuple of Tensor, or TensorList
+    schema = native_function.func
+    sig = DispatcherSignature.from_schema(schema)
+    returns = schema.returns
+
+    # Check assumptions. If these are invalid we return None
+    # and punt the work to handle them to the future.
+    assert schema.kind() == SchemaKind.inplace
+    if not is_mutated_arg(schema.arguments.flat_all[0]):
+        return None
+    if not len([arg for arg in schema.arguments.flat_all if is_mutated_arg(arg)]) == 1:
+        return None
+
+    # Only support cases where all returns are Tensors or vector<Tensor>
+    if len(returns) == 0:
+        return None
+    if not all(is_tensor(ret.type) or is_tensor_list(ret.type) for ret in returns):
+        return None
+    if not accepts_at_least_one_tensor_input(schema):
+        return None
+
+    cur_level_var = "cur_level"
+
+    unwraps, unwrapped_arg_list = gen_unwraps(schema.arguments.flat_all, cur_level_var)
+    bdims_all_none_case = gen_case_where_all_bdims_are_none(sig, schema, cur_level_var)
+
+    return f"""\
+template <typename batch_rule_t, batch_rule_t batch_rule>
+{sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "gen_vmap_inplace_plumbing");
+  int64_t {cur_level_var} = maybe_layer->layerId();
+{textwrap.indent(bdims_all_none_case, "  ")}
+{textwrap.indent(unwraps, "  ")}
+  batch_rule({', '.join(unwrapped_arg_list)});
+  return {schema.arguments.flat_all[0].name};
+}}"""


+def gen_vmap_plumbing_no_returns(native_function: NativeFunction) -> str:
+    schema = native_function.func
+    sig = DispatcherSignature.from_schema(schema)
+    cur_level_var = "cur_level"
+
+    unwraps, unwrapped_arg_list = gen_unwraps(schema.arguments.flat_all, cur_level_var)
+    bdims_all_none_case = gen_case_where_all_bdims_are_none(sig, schema, cur_level_var)
+
+    return f"""\
+template <typename batch_rule_t, batch_rule_t batch_rule>
+{sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "gen_vmap_plumbing_no_returns");
+  int64_t {cur_level_var} = maybe_layer->layerId();
+{textwrap.indent(bdims_all_none_case, "  ")}
+{textwrap.indent(unwraps, "  ")}
+  batch_rule({', '.join(unwrapped_arg_list)});
+}}"""
+
+
+def gen_vmap_plumbing(native_function: NativeFunction) -> str | None:
+    schema = native_function.func
+    sig = DispatcherSignature.from_schema(schema)
+    returns = 
schema.returns + + # Only support cases where all returns are Tensors or vector + if not accepts_at_least_one_tensor_input(schema): + return None + if len(returns) == 0: + return gen_vmap_plumbing_no_returns(native_function) + return_symint_overrides = [ + "_scaled_dot_product_flash_attention", + "_scaled_dot_product_cudnn_attention", + ] + if ( + not all(ret.type.is_tensor_like() for ret in returns) + and schema.name.unambiguous_name() not in return_symint_overrides + ): + return None + # in-place views need special handling + if "inplace_view" in native_function.tags: + return None + + if schema.kind() == SchemaKind.inplace: + return gen_vmap_inplace_plumbing(native_function) + + # Don't support these (mutable, out, scratch) + if schema.kind() != SchemaKind.functional: + return None + + results_var = "results" + cur_level_var = "cur_level" + + unwraps, unwrapped_arg_list = gen_unwraps(schema.arguments.flat_all, cur_level_var) + bdims_all_none_case = gen_case_where_all_bdims_are_none(sig, schema, cur_level_var) + + wrapped_returns = gen_returns(returns, cur_level_var, results_var) + return f"""\ +template +{sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{ + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + auto maybe_layer = maybeCurrentDynamicLayer(); + vmap_check_escaped(maybe_layer, "gen_vmap_plumbing"); + int64_t {cur_level_var} = maybe_layer->layerId(); +{textwrap.indent(bdims_all_none_case, " ")} +{textwrap.indent(unwraps, " ")} + auto {results_var} = batch_rule({', '.join(unwrapped_arg_list)}); + {wrapped_returns} +}}""" + + +@dataclass(frozen=True) +class ComputeBatchRulePlumbing: + @method_with_native_function + def __call__(self, f: NativeFunction) -> str | None: + result = gen_vmap_plumbing(f) + return result + + +def gen_all_vmap_plumbing(native_functions: Sequence[NativeFunction]) -> str: + body = "\n".join(list(mapMaybe(ComputeBatchRulePlumbing(), native_functions))) + return f""" +#pragma once +#include +#include + +namespace at {{ namespace functorch {{ + +{body} + +}}}} // namespace at::functorch +""" diff --git a/torchgen/local.py b/torchgen/local.py new file mode 100644 index 00000000000..7c687c3a799 --- /dev/null +++ b/torchgen/local.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import threading +from contextlib import contextmanager +from typing import Iterator + + +# Simple dynamic scoping implementation. The name "parametrize" comes +# from Racket. +# +# WARNING WARNING: LOOKING TO EDIT THIS FILE? Think carefully about +# why you need to add a toggle to the global behavior of code +# generation. The parameters here should really only be used +# for "temporary" situations, where we need to temporarily change +# the codegen in some cases because we cannot conveniently update +# all call sites, and are slated to be eliminated once all call +# sites are eliminated. If you don't have a plan for how to get there, +# DON'T add a new entry here. 
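As a usage illustration of the dynamic-scoping `parametrize` context manager implemented just below (a minimal sketch; the call site shown here is hypothetical, but the keyword arguments and getter names match the `local.py` code that follows):

```python
from torchgen import local

# Hypothetical call site: both knobs are set for the duration of the block
# and restored to their previous values on exit.
with local.parametrize(
    use_const_ref_for_mutable_tensors=False,
    use_ilistref_for_tensor_lists=False,
):
    # Inside the block the thread-local getters return the values set above;
    # outside an active parametrize() block they assert instead.
    assert local.use_const_ref_for_mutable_tensors() is False
    assert local.use_ilistref_for_tensor_lists() is False
```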
+ + +class Locals(threading.local): + use_const_ref_for_mutable_tensors: bool | None = None + use_ilistref_for_tensor_lists: bool | None = None + + +_locals = Locals() + + +def use_const_ref_for_mutable_tensors() -> bool: + assert _locals.use_const_ref_for_mutable_tensors is not None, ( + "need to initialize local.use_const_ref_for_mutable_tensors with " + "local.parametrize" + ) + return _locals.use_const_ref_for_mutable_tensors + + +def use_ilistref_for_tensor_lists() -> bool: + assert _locals.use_ilistref_for_tensor_lists is not None, ( + "need to initialize local.use_ilistref_for_tensor_lists with " + "local.parametrize" + ) + return _locals.use_ilistref_for_tensor_lists + + +@contextmanager +def parametrize( + *, use_const_ref_for_mutable_tensors: bool, use_ilistref_for_tensor_lists: bool +) -> Iterator[None]: + old_use_const_ref_for_mutable_tensors = _locals.use_const_ref_for_mutable_tensors + old_use_ilistref_for_tensor_lists = _locals.use_ilistref_for_tensor_lists + try: + _locals.use_const_ref_for_mutable_tensors = use_const_ref_for_mutable_tensors + _locals.use_ilistref_for_tensor_lists = use_ilistref_for_tensor_lists + yield + finally: + _locals.use_const_ref_for_mutable_tensors = ( + old_use_const_ref_for_mutable_tensors + ) + _locals.use_ilistref_for_tensor_lists = old_use_ilistref_for_tensor_lists diff --git a/torchgen/model.py b/torchgen/model.py new file mode 100644 index 00000000000..95694934310 --- /dev/null +++ b/torchgen/model.py @@ -0,0 +1,2851 @@ +from __future__ import annotations + +import dataclasses +import itertools +import re +from dataclasses import dataclass +from enum import auto, Enum +from typing import Callable, Iterator, Sequence + +from torchgen.utils import assert_never, NamespaceHelper, OrderedSet + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# DATA MODEL +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Some general principles for our data model. +# +# - Stop using C++ data types as the internal data representation +# format. Instead, the internal data structures are centered +# around JIT schema representation. This avoid a big problem +# with the old codegen where we read in all the types from +# native_functions.yaml and then immediately had to retranslate +# them into C++ types. +# +# - More semantic data representation. Instead of representing +# everything as dicts and strings, we define dataclasses for +# every interesting entity the code generation has to deal with. +# These dataclasses have strong semantic invariants: for example, +# we generally require them to roundtrip losslessly into the +# form they were parsed from. These structures are immutable +# and you're expected to populate information once during +# construction. 
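To make the "roundtrip losslessly" requirement above concrete, here is a minimal sketch (it assumes the `FunctionSchema.parse` helper defined further down in this file; the schema string is an illustrative example in native_functions.yaml syntax):

```python
from torchgen.model import FunctionSchema

# Parse a JIT-style schema string into the immutable data model ...
s = "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
schema = FunctionSchema.parse(s)

# ... and render it back out; parsing is expected to be lossless.
assert str(schema) == s
```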
+ + +# Represent a source location; used for better error reporting +@dataclass(frozen=True) +class Location: + file: str + line: int + + def __str__(self) -> str: + return f"{self.file}:{self.line}" + + +# Valid values of the 'variants' field in native_functions.yaml +class Variant(Enum): + function = auto() + method = auto() + + +# Default kernel namespace +DEFAULT_KERNEL_NAMESPACE = "at::native" + +# NOTE: Keep the list in sync with `DispatchKey` in c10/core/DispatchKey.h +BACKEND_COMPONENTS = "CPU CUDA HIP XLA MTIA MPS IPU XPU HPU VE Lazy Meta PrivateUse1 PrivateUse2 PrivateUse3".split() +FUNCTIONALITY_KEYS = [ + "", + "Quantized", + "Sparse", + "SparseCsr", + "NestedTensor", + "Autograd", +] + +# This list guards dispatches that can be used in derivatives.yaml +# For now we omit AutogradFunctionality and AutogradOther +AUTOGRAD_KEYS = ["AutogradNestedTensor"] + [ + "Autograd" + component for component in BACKEND_COMPONENTS +] + +FRAGMENT_NAMESPACES = {"quantized", "quantized_decomposed"} + + +# This doesn't have to be in sync with the header, it only needs to contain +# entries that we actually use in the codegen or want pyi entries for +class DispatchKey(Enum): + Undefined = 0 + CatchAll = Undefined + + FPGA = auto() + MAIA = auto() + Vulkan = auto() + Metal = auto() + MKLDNN = auto() + OpenGL = auto() + OpenCL = auto() + IDEEP = auto() + CustomRNGKeyId = auto() + MkldnnCPU = auto() + Sparse = auto() + SparseCsr = auto() + NestedTensor = auto() + Dense = auto() + + PythonTLSSnapshot = auto() + PreDispatch = auto() + PythonDispatcher = auto() + Python = auto() + FuncTorchDynamicLayerBackMode = auto() + ZeroTensor = auto() + Conjugate = auto() + Negative = auto() + BackendSelect = auto() + Named = auto() + AutogradOther = auto() + AutogradFunctionality = auto() + AutogradNestedTensor = auto() + Tracer = auto() + Autocast = auto() + AutocastCPU = auto() + AutocastCUDA = auto() + Batched = auto() + VmapMode = auto() + FuncTorchGradWrapper = auto() + FuncTorchBatched = auto() + BatchedNestedTensor = auto() + FuncTorchVmapMode = auto() + FuncTorchDynamicLayerFrontMode = auto() + Functionalize = auto() + TESTING_ONLY_GenericWrapper = auto() + TESTING_ONLY_GenericMode = auto() + + ADInplaceOrView = auto() + Autograd = auto() + CompositeImplicitAutograd = auto() + CompositeImplicitAutogradNestedTensor = auto() + CompositeExplicitAutograd = auto() + CompositeExplicitAutogradNonFunctional = auto() + FuncTorchBatchedDecomposition = auto() + + # BEGIN autogenerated + CPU = auto() + CUDA = auto() + HIP = auto() + XLA = auto() + MTIA = auto() + MPS = auto() + IPU = auto() + XPU = auto() + HPU = auto() + VE = auto() + Lazy = auto() + Meta = auto() + PrivateUse1 = auto() + PrivateUse2 = auto() + PrivateUse3 = auto() + QuantizedCPU = auto() + QuantizedCUDA = auto() + QuantizedHIP = auto() + QuantizedXLA = auto() + QuantizedMTIA = auto() + QuantizedMPS = auto() + QuantizedIPU = auto() + QuantizedXPU = auto() + QuantizedHPU = auto() + QuantizedVE = auto() + QuantizedLazy = auto() + QuantizedMeta = auto() + QuantizedPrivateUse1 = auto() + QuantizedPrivateUse2 = auto() + QuantizedPrivateUse3 = auto() + SparseCPU = auto() + SparseCUDA = auto() + SparseHIP = auto() + SparseXLA = auto() + SparseMTIA = auto() + SparseMPS = auto() + SparseIPU = auto() + SparseXPU = auto() + SparseHPU = auto() + SparseVE = auto() + SparseLazy = auto() + SparseMeta = auto() + SparsePrivateUse1 = auto() + SparsePrivateUse2 = auto() + SparsePrivateUse3 = auto() + SparseCsrCPU = auto() + SparseCsrCUDA = auto() + SparseCsrHIP = 
auto() + SparseCsrXLA = auto() + SparseCsrMTIA = auto() + SparseCsrMPS = auto() + SparseCsrIPU = auto() + SparseCsrXPU = auto() + SparseCsrHPU = auto() + SparseCsrVE = auto() + SparseCsrLazy = auto() + SparseCsrMeta = auto() + SparseCsrPrivateUse1 = auto() + SparseCsrPrivateUse2 = auto() + SparseCsrPrivateUse3 = auto() + NestedTensorCPU = auto() + NestedTensorCUDA = auto() + NestedTensorHIP = auto() + NestedTensorXLA = auto() + NestedTensorMTIA = auto() + NestedTensorMPS = auto() + NestedTensorIPU = auto() + NestedTensorXPU = auto() + NestedTensorHPU = auto() + NestedTensorVE = auto() + NestedTensorLazy = auto() + NestedTensorMeta = auto() + NestedTensorPrivateUse1 = auto() + NestedTensorPrivateUse2 = auto() + NestedTensorPrivateUse3 = auto() + AutogradCPU = auto() + AutogradCUDA = auto() + AutogradHIP = auto() + AutogradXLA = auto() + AutogradMTIA = auto() + AutogradMPS = auto() + AutogradIPU = auto() + AutogradXPU = auto() + AutogradHPU = auto() + AutogradVE = auto() + AutogradLazy = auto() + AutogradMeta = auto() + AutogradPrivateUse1 = auto() + AutogradPrivateUse2 = auto() + AutogradPrivateUse3 = auto() + # END autogenerated + + def __str__(self) -> str: + return self.name + + def lower(self) -> str: + return str(self).lower() + + @staticmethod + def parse(value: str) -> DispatchKey: + for k, v in DispatchKey.__members__.items(): + if k == value: + return v + raise AssertionError(f"unknown dispatch key {value}") + + +class _TorchDispatchModeKey(Enum): + FAKE = auto() + PROXY = auto() + FUNCTIONAL = auto() + + +def codegen_per_backend_entries() -> str: + r = [] + for fk in FUNCTIONALITY_KEYS: + for bc in BACKEND_COMPONENTS: + r.append(f" {fk}{bc} = auto()") + return "\n".join(r) + + +for fk in FUNCTIONALITY_KEYS: + for bc in BACKEND_COMPONENTS: + if not hasattr(DispatchKey, fk + bc): + r = codegen_per_backend_entries() + print(r) + raise RuntimeError( + f"Missing {fk}{bc} from DispatchKey enum. Here is the autogenerated list we expect to have:\n\n{r}" + ) + + +STRUCTURED_DISPATCH_KEYS = { + DispatchKey.MPS, + DispatchKey.CUDA, + DispatchKey.CPU, + DispatchKey.XPU, +} +UFUNC_DISPATCH_KEYS = {DispatchKey.CUDA, DispatchKey.CPU} + +# Set of supported dispatch keys +dispatch_keys = [ + DispatchKey.CPU, + DispatchKey.SparseCPU, + DispatchKey.SparseCsrCPU, + DispatchKey.MkldnnCPU, + DispatchKey.CUDA, + DispatchKey.MPS, + DispatchKey.XPU, + DispatchKey.SparseCUDA, + DispatchKey.SparseCsrCUDA, + DispatchKey.QuantizedCPU, + DispatchKey.QuantizedCUDA, + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + DispatchKey.CompositeExplicitAutograd, + DispatchKey.CompositeExplicitAutogradNonFunctional, + DispatchKey.NestedTensorCPU, + DispatchKey.NestedTensorCUDA, + # Meta is a magic key: it is automatically generated for structured + # kernels + DispatchKey.Meta, + DispatchKey.SparseMeta, + DispatchKey.SparseCsrMeta, + DispatchKey.QuantizedMeta, + DispatchKey.NestedTensorMeta, + DispatchKey.ZeroTensor, +] + + +# Dispatch keys that "support all backends". These codegen slightly differently +# then backend specific keys. 
+def is_generic_dispatch_key(dk: DispatchKey) -> bool: + return dk in { + DispatchKey.CompositeExplicitAutograd, + DispatchKey.CompositeExplicitAutogradNonFunctional, + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + } + + +# CUDA specific dispatch keys +def is_cuda_dispatch_key(dk: DispatchKey) -> bool: + return dk in { + DispatchKey.CUDA, + DispatchKey.QuantizedCUDA, + DispatchKey.SparseCUDA, + DispatchKey.SparseCsrCUDA, + DispatchKey.NestedTensorCUDA, + DispatchKey.AutogradCUDA, + } + + +# XPU specific dispatcy keys +def is_xpu_dispatch_key(dk: DispatchKey) -> bool: + return dk in { + DispatchKey.XPU, + DispatchKey.QuantizedXPU, + DispatchKey.SparseXPU, + DispatchKey.SparseCsrXPU, + DispatchKey.NestedTensorXPU, + DispatchKey.AutogradXPU, + } + + +# Structured kernel generation is only supported for certain key types; +# otherwise use old-style +def is_structured_dispatch_key(dk: DispatchKey) -> bool: + return dk in STRUCTURED_DISPATCH_KEYS + + +def is_ufunc_dispatch_key(dk: DispatchKey) -> bool: + # For now, ufunc dispatch keys coincide with structured keys + return dk in UFUNC_DISPATCH_KEYS + + +# This is oddly named ScalarType and not DType for symmetry with C++ +class ScalarType(Enum): + Byte = auto() + Char = auto() + Short = auto() + Int = auto() + Long = auto() + Half = auto() + Float = auto() + Double = auto() + ComplexHalf = auto() + ComplexFloat = auto() + ComplexDouble = auto() + Bool = auto() + BFloat16 = auto() + Float8_e5m2 = auto() + Float8_e5m2fnuz = auto() + Float8_e4m3fn = auto() + Float8_e4m3fnuz = auto() + + def __str__(self) -> str: + return self.name + + @staticmethod + def maybe_parse(value: str) -> ScalarType | None: + for k, v in ScalarType.__members__.items(): + if k == value: + return v + return None + + @staticmethod + def parse(value: str) -> ScalarType: + mb_r = ScalarType.maybe_parse(value) + assert mb_r is not None, f"unknown dtype {value}" + return mb_r + + @staticmethod + def parse_set(values: str) -> OrderedSet[ScalarType]: + dtypes: OrderedSet[ScalarType] = OrderedSet() + for value in values.split(", "): + if value in DTYPE_CLASSES: + dtypes.update(DTYPE_CLASSES[value]) + else: + dtypes.add(ScalarType.parse(value)) + return dtypes + + +DTYPE_CLASSES: dict[str, OrderedSet[ScalarType]] = {} +# NB: Integral doesn't include boolean +DTYPE_CLASSES["Integral"] = OrderedSet( + [ + ScalarType.Byte, + ScalarType.Char, + ScalarType.Int, + ScalarType.Long, + ScalarType.Short, + ] +) +# NB: Floating doesn't include low precision types +DTYPE_CLASSES["Floating"] = OrderedSet([ScalarType.Float, ScalarType.Double]) +DTYPE_CLASSES["Complex"] = OrderedSet( + [ScalarType.ComplexFloat, ScalarType.ComplexDouble] +) +DTYPE_CLASSES["All"] = DTYPE_CLASSES["Integral"] | DTYPE_CLASSES["Floating"] +DTYPE_CLASSES["AllAndComplex"] = DTYPE_CLASSES["All"] | DTYPE_CLASSES["Complex"] +DTYPE_CLASSES["FloatingAndComplex"] = ( + DTYPE_CLASSES["Floating"] | DTYPE_CLASSES["Complex"] +) + + +# Represents the valid entries for ufunc_inner_loop in native_functions.yaml. +# NB: if you add a new UfuncKey, you will teach torchgen.dest.ufunc how +# to process it. Most logic will ignore keys they don't understand, so your +# new key will get silently ignored until you hook in logic to deal with it. 
+class UfuncKey(Enum): + # These are low level keys that represent exactly one particular + # instantiation of the kernel produced by codegen + CUDAFunctor = auto() + CUDAFunctorOnOther = auto() + CUDAFunctorOnSelf = auto() + + CPUScalar = auto() + CPUVector = auto() + + # These are the ones users will usually specify, and + # implicitly "fill in" the low level keys + ScalarOnly = auto() # CUDA*, CPUScalar + Generic = auto() # CUDA*, CPU* + + def __str__(self) -> str: + return self.name + + @staticmethod + def parse(value: str) -> UfuncKey: + for k, v in UfuncKey.__members__.items(): + if k == value: + return v + raise AssertionError(f"unknown ufunc key {value}") + + +class DeviceCheckType(Enum): + NoCheck = 0 + ExactSame = 1 + + +class ViewSchemaKind(Enum): + aliasing = auto() + aliasing_inplace = auto() + non_aliasing = auto() + + +# The basic input to the code generation is native_functions.yaml. +# The name "native", BTW, comes from the distinction between native +# functions and legacy TH functions. The legacy TH functions are gone, +# but the "native" descriptor has stuck. +# +# NativeFunction models a single entry in native_functions.yaml. Its +# fields roughly correspond to what you would see in the YAML itself, +# but after canonicalization and parsing has occurred. +# +# You can see some of the overall design patterns for how we setup +# dataclasses in this class, but we will defer a complete discussion +# of this at FunctionSchema. +@dataclass(frozen=True) +class NativeFunction: + # The namespace for this operator. For example, if we have "at::add" + # then the namespace would be "at". This enables ops to be registered + # through the same DSL with a custom namespace. If not specified, the + # default namespace would be "at". + namespace: str + + # The function schema of the operator in question. This schema + # has been parsed; see FunctionSchema for more about its structure. + # (This type is quoted as we are forward referencing a type + # defined later in the file. I opted for this ordering of the + # classes for expository clarity.) + func: FunctionSchema + + # Whether or not to generate mutable tensor arguments like regular + # ones + use_const_ref_for_mutable_tensors: bool + + # Whether or not to omit automatic generation of a DeviceGuard + device_guard: bool + + # How to emit automatic generation of device check + device_check: DeviceCheckType + + # What python module to put the function in + python_module: str | None + + # TODO: figure out what this does + category_override: str | None + + # If no variants are specified in native_functions.yaml, this is + # assumed to be {'function'}. + variants: set[Variant] + + # Whether or not we should skip generating registrations for + # this kernel. This is a bit of a double-edged sword, as manual + # registrations don't participate in codegen-based selective build! + manual_kernel_registration: bool + + # Whether or not to skip generating TensorMethod/Functions bindings + # for this kernel. Technically, this doesn't actually skip generating + # the binding; instead, the binding gets generated to __dispatch_{funcname} + # so you can make use of the normal binding if you need it. + manual_cpp_binding: bool + + # The location in the YAML file were this native function entry was + # defined. This is for conveniently reporting error messages! + loc: Location + + # A list of operators that are expected to be auto-generated for this NativeFunction. + # Note: This list isn't actually directly used by the codegen to generate anything. 
+    # Instead, the codegen figures out what operators to generate purely based off of
+    # function schema, and uses the autogen declarations to error check.
+    # We expect every NativeFunction that gets auto-generated to be explicitly called out
+    # in native_functions.yaml
+    autogen: list[OperatorName]
+
+    # If non-empty, this kernel is subject to ufunc codegen.
+    # Sorted by ufunc_key
+    ufunc_inner_loop: dict[UfuncKey, UfuncInnerLoop]
+
+    # Whether or not this out function is a "structured kernel". Structured
+    # kernels are defined a little differently from normal kernels; in
+    # particular, their shape checking logic is defined separately from
+    # the kernel. Only out functions can be structured; other functions
+    # delegate to the out function using the structured_delegate keyword.
+    # Every structured kernel must have at least an out and a functional
+    # variant.
+    structured: bool
+
+    # Whether or not this non-out function is a structured kernel, defined
+    # in terms of the out kernel referenced by the string here.
+    structured_delegate: OperatorName | None
+
+    # Only valid for structured kernels. Specifies an alternative class to
+    # inherit from when defining the meta class for the structured
+    # operator. This will usually be TensorIteratorBase. This also
+    # changes the semantics of set_output to call the parent class.
+    structured_inherits: str | None
+
+    # Structured kernels can declare elements as "precomputed". These elements
+    # are returned by the meta function in one struct and passed to the impl
+    # function in lieu of certain kernel arguments that these precomputed
+    # elements supersede. Information about the names and types of these
+    # precomputed elements and how they correspond to kernel arguments is stored
+    # in this member, if applicable.
+    precomputed: Precompute | None
+
+    # Argument names whose default should be excluded from the C++ interface.
+    # Intended for resolving overload ambiguities between signatures.
+    cpp_no_default_args: set[str]
+
+    # Note [Abstract ATen methods]
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # An abstract ATen method is one whose dispatch differs between
+    # types. These are implemented in derived types (with a
+    # standard (throwing) definition in Type). A concrete ATen
+    # method is one which has the same dispatch for all types;
+    # we just implement it in the base Type. This is exposed
+    # in Declarations.yaml via a field named 'abstract'.
+    is_abstract: bool
+
+    # Whether or not the NativeFunction contains a backend-agnostic kernel
+    has_composite_implicit_autograd_kernel: bool
+    has_composite_implicit_autograd_nested_tensor_kernel: bool
+    has_composite_explicit_autograd_kernel: bool
+    has_composite_explicit_autograd_non_functional_kernel: bool
+
+    # Tags are used to describe semantic information about (groups of) operators
+    # that isn't easily inferrable directly from the operator's schema.
+    tags: set[str]
+
+    # NB: The benefit of defining a dataclass is that we automatically get
+    # a constructor defined for all the fields we specify. No need
+    # to explicitly write it out.
+
+    # We parse both the NativeFunction + backend-specific information about it,
+    # which is stored in a corresponding BackendIndex.
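+    # For orientation, a hypothetical, abbreviated native_functions.yaml entry
+    # that from_yaml() below would consume looks roughly like:
+    #
+    #   - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    #     structured: True
+    #     structured_inherits: TensorIteratorBase
+    #     dispatch:
+    #       CPU: add_out
+    #       CUDA: add_out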
+ @staticmethod + def from_yaml( + ei: dict[str, object], + loc: Location, + valid_tags: set[str], + ignore_keys: set[DispatchKey] | None = None, + ) -> tuple[NativeFunction, dict[DispatchKey, dict[OperatorName, BackendMetadata]]]: + """ + Parse a NativeFunction from a dictionary as directly parsed + from native_functions.yaml + """ + e = ei.copy() + + funcs = e.pop("func") + assert isinstance(funcs, str), f"not a str: {funcs}" + # only support one level of namespace. E.g., aten::add + namespace_helper = NamespaceHelper.from_namespaced_entity( + namespaced_entity=funcs, max_level=1 + ) + namespace = namespace_helper.get_cpp_namespace(default="aten") + func = FunctionSchema.parse(namespace_helper.entity_name) + + cpp_no_default_args_list = e.pop("cpp_no_default_args", []) + assert isinstance(cpp_no_default_args_list, list) + cpp_no_default_args = set(cpp_no_default_args_list) + + use_const_ref_for_mutable_tensors = e.pop( + "use_const_ref_for_mutable_tensors", False + ) + assert isinstance(use_const_ref_for_mutable_tensors, bool) + + variants_s = e.pop("variants", "function") + assert isinstance(variants_s, str) + variants: set[Variant] = set() + for v in variants_s.split(", "): + if v == "function": + variants.add(Variant.function) + elif v == "method": + variants.add(Variant.method) + else: + raise AssertionError(f"illegal variant {v}") + + manual_kernel_registration = e.pop("manual_kernel_registration", False) + assert isinstance( + manual_kernel_registration, bool + ), f"not a bool: {manual_kernel_registration}" + + manual_cpp_binding = e.pop("manual_cpp_binding", False) + assert isinstance(manual_cpp_binding, bool), f"not a bool: {manual_cpp_binding}" + + device_guard = e.pop("device_guard", True) + assert isinstance(device_guard, bool), f"not a bool: {device_guard}" + + device_check_s = e.pop("device_check", None) + assert device_check_s is None or isinstance( + device_check_s, str + ), f"not a str: {device_check_s}" + assert ( + device_check_s is None or device_check_s in DeviceCheckType.__members__ + ), f"illegal device_check: {device_check_s}" + device_check: DeviceCheckType + if device_check_s is None: + device_check = DeviceCheckType.ExactSame + else: + device_check = DeviceCheckType[device_check_s] + + structured = e.pop("structured", False) + assert isinstance(structured, bool), f"not a bool: {structured}" + + structured_delegate_s = e.pop("structured_delegate", None) + assert structured_delegate_s is None or isinstance( + structured_delegate_s, str + ), f"not a str: {structured_delegate_s}" + assert structured_delegate_s is None or "::" not in structured_delegate_s, ( + "namespace is not supported in structured delegate," + " using the same namespace as the native function" + ) + structured_delegate: OperatorName | None = None + if structured_delegate_s is not None: + structured_delegate = OperatorName.parse(structured_delegate_s) + + structured_inherits = e.pop("structured_inherits", None) + assert structured_inherits is None or isinstance( + structured_inherits, str + ), f"not a str: {structured_inherits}" + assert structured_inherits is None or "::" not in structured_inherits, ( + "namespace is not supported in structured inherits," + " using the same namespace as the native function" + ) + + python_module = e.pop("python_module", None) + assert python_module is None or isinstance( + python_module, str + ), f"not a str: {python_module}" + assert ( + python_module is None or Variant.method not in variants + ), "functions in modules cannot be methods" + + category_override = 
e.pop("category_override", None) + assert category_override is None or isinstance( + category_override, str + ), f"not a str: {category_override}" + + precomputed_dict = e.pop("precomputed", None) + assert precomputed_dict is None or structured is True + precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None + + tags_inp = e.pop("tags", []) + if isinstance(tags_inp, str): + tags_inp = [tags_inp] + assert isinstance(tags_inp, list) + + # All aten ops generated by torchgen receive the pt2_compliant tag. + if namespace == "aten" and "pt2_compliant_tag" in valid_tags: + tags_inp.append("pt2_compliant_tag") + + tags: set[str] = set() + for t in tags_inp: + assert len(valid_tags) > 0 + # TODO: verify that the tag is valid and has an entry in tags.yaml + if t in valid_tags: + tags.add(t) + else: + raise AssertionError(f"illegal tag {t}") + + from torchgen.api import cpp + + raw_dispatch = e.pop("dispatch", None) + assert raw_dispatch is None or isinstance(raw_dispatch, dict), e + dispatch: dict[DispatchKey, BackendMetadata] = {} + num_dispatch_keys: int = 0 + if raw_dispatch is not None: + assert not manual_kernel_registration, ( + "cannot specify both manual_kernel_registration and dispatch; with " + "manual registration, dispatch has no effect!" + ) + redundant_composite_implicit_autograd = False + for ks, v in raw_dispatch.items(): + if ks == "__line__": + continue # not worth tracking line numbers for dispatch entries + assert isinstance( + ks, str + ), f"illegal dispatch key '{ks}' in {raw_dispatch}" + assert isinstance( + v, str + ), f"illegal dispatch value '{v}' in {raw_dispatch}" + for k in ks.split(","): + dispatch_key = DispatchKey.parse(k.strip()) + num_dispatch_keys += 1 + + if ignore_keys and dispatch_key in ignore_keys: + continue + assert dispatch_key in dispatch_keys, ( + f"Dispatch key {dispatch_key} of kernel {v} " + "is not a supported dispatch key." + ) + # We only allow at most 3 levels of namespace for kernels. + # We will append "native" to a custom kernel namespace. + namespace_helper = NamespaceHelper.from_namespaced_entity( + v, max_level=3 + ) + kernel_namespace = namespace_helper.get_cpp_namespace(default="at") + # Why is 'structured' included? External backends (e.g. + # XLA) opt into which ops are structured independently + # of which in-tree ops are structured + dispatch[dispatch_key] = BackendMetadata( + kernel=namespace_helper.entity_name, + structured=structured + and is_structured_dispatch_key(dispatch_key), + cpp_namespace=(kernel_namespace + "::native"), + ) + if ( + dispatch_key is DispatchKey.CompositeImplicitAutograd + and v == cpp.name(func) + ): + redundant_composite_implicit_autograd = True + + # We count the number of dispatch keys which have not been ignored to prevent a dispatch table + # in which all backend keys are ignored but necessarily kept, remaining compositeimplicit, + # from being treated as redundant. 
+ assert not ( + num_dispatch_keys == 1 and redundant_composite_implicit_autograd + ), ( + "unnecessary dispatch table for this function; just delete the dispatch " + "key entirely" + ) + # if a function is a structured delegate, deleting the dispatch + # table is NOT semantics preserving + assert ( + structured_delegate + or dispatch.keys() != {DispatchKey.CompositeImplicitAutograd} + or dispatch[DispatchKey.CompositeImplicitAutograd].supports_symint() + or num_dispatch_keys != 1 + ), ( + f"unexpected name for singleton CompositeImplicitAutograd dispatch entry: expected {cpp.name(func)} " + f"but got {dispatch[DispatchKey.CompositeImplicitAutograd]}. Rename your implementation to the expected " + "name, then delete the dispatch table" + ) + elif not structured and structured_delegate is None: + name = str(func.name.name) + assert not ( + name.startswith("new_") + or name.endswith("_like") + # TODO: maybe it's better to test the return + or ( + func.arguments.tensor_options + and not func.arguments.has_tensor_arg() + ) + ), ( + f"expected {name} to have a CompositeExplicitAutograd " + "dispatch entry, but there was no dispatch table. Factory functions " + "should not have implicit dispatch as they should not be decomposed " + "for __torch_dispatch__" + ) + dispatch[DispatchKey.CompositeImplicitAutograd] = BackendMetadata( + cpp.name(func), structured=False, cpp_namespace=DEFAULT_KERNEL_NAMESPACE + ) + + composites_in_dispatch = [ + d + for d in dispatch + if d == DispatchKey.CompositeExplicitAutograd + or d == DispatchKey.CompositeExplicitAutogradNonFunctional + or d == DispatchKey.CompositeImplicitAutograd + or d == DispatchKey.CompositeImplicitAutogradNestedTensor + ] + + assert len(composites_in_dispatch) <= 1 or ( + len(composites_in_dispatch) == 2 + and ( + DispatchKey.CompositeExplicitAutogradNonFunctional + not in composites_in_dispatch + ) + and ( + DispatchKey.CompositeImplicitAutogradNestedTensor + in composites_in_dispatch + ) + ), ( + "cannot specify more than one of CompositeExplicitAutograd, CompositeExplicitAutogradNonFunctional, " + "or CompositeImplicitAutograd on a single kernel; each " + "strictly subsumes the other. 
If you wanted to provide an explicit autograd " + "implementation, specify CompositeExplicitAutograd; otherwise specify CompositeImplicitAutograd only" + ) + + autogen_str = e.pop("autogen", "") + assert isinstance(autogen_str, str) + autogen = ( + [] + if autogen_str == "" + else [OperatorName.parse(x) for x in autogen_str.split(", ")] + ) + + raw_ufunc_inner_loop = e.pop("ufunc_inner_loop", {}) + ufunc_inner_loop = {} + if isinstance(raw_ufunc_inner_loop, str): + ufunc_inner_loop[UfuncKey.Generic] = UfuncInnerLoop.parse( + raw_ufunc_inner_loop, UfuncKey.Generic + ) + elif isinstance(raw_ufunc_inner_loop, dict): + for k, vo in raw_ufunc_inner_loop.items(): + if k == "__line__": + continue + assert isinstance(k, str), f"ufunc_inner_loop key is not a str: {k}" + assert isinstance(vo, str), f"ufunc_inner_loop value is not a str: {v}" + ufunc_key = UfuncKey.parse(k) + ufunc_inner_loop[ufunc_key] = UfuncInnerLoop.parse(vo, ufunc_key) + else: + raise AssertionError( + f"ufunc_inner_loop not str or dict: {raw_ufunc_inner_loop}" + ) + # Program the BackendIndex for the implicit dispatch entry from ufunc + if ufunc_inner_loop: + assert structured, "ufunc must be structured" + + # Delay import ufunc here to avoid circular import issue + # See: https://github.com/pytorch/pytorch/issues/81294 + import torchgen.api.ufunc as ufunc + + for dispatch_key in UFUNC_DISPATCH_KEYS: + assert ( + dispatch_key not in dispatch + ), f"ufunc should not have explicit dispatch entry for {dispatch_key}" + dispatch[dispatch_key] = BackendMetadata( + kernel=ufunc.schema_kernel_name(func, dispatch_key), + structured=True, + cpp_namespace=DEFAULT_KERNEL_NAMESPACE, + ) + + if structured_delegate: + # Structured functions MUST have a dispatch table + is_abstract = True + else: + is_abstract = ( + dispatch.keys() != {DispatchKey.CompositeImplicitAutograd} + and dispatch.keys() + != {DispatchKey.CompositeImplicitAutogradNestedTensor} + and dispatch.keys() + != { + DispatchKey.CompositeImplicitAutograd, + DispatchKey.CompositeImplicitAutogradNestedTensor, + } + ) + + has_composite_implicit_autograd_kernel = ( + DispatchKey.CompositeImplicitAutograd in dispatch + ) + has_composite_implicit_autograd_nested_tensor_kernel = ( + DispatchKey.CompositeImplicitAutogradNestedTensor in dispatch + ) + has_composite_explicit_autograd_kernel = ( + DispatchKey.CompositeExplicitAutograd in dispatch + ) + has_composite_explicit_autograd_non_functional_kernel = ( + DispatchKey.CompositeExplicitAutogradNonFunctional in dispatch + ) + + # We aren't going to store dispatch metadata inline in NativeFunctions; + # instead it is separately indexed by backend (so other backends can + # add more dispatch entries after the fact). Reindex the individual + # metadata by OperatorName! 
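+        # Illustrative shape of the reindexed mapping (values elided):
+        #   {DispatchKey.CPU: {OperatorName.parse("add.out"): BackendMetadata(...)}}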
+ backend_metadata = {k: {func.name: v} for k, v in dispatch.items()} + + # don't care if it exists or not; make it easier to use this function + # with other yaml parsers that aren't setting __line__ in the dict + e.pop("__line__", None) + assert not e, f"leftover entries: {e}" + + # Asserts that we can't do in post_init, because they rely on backend-specific info + if structured_delegate is not None: + for key in STRUCTURED_DISPATCH_KEYS: + assert key not in dispatch, ( + f"if structured_delegate, then must not have {key} in dispatch dictionary " + "(it is delegated!)" + ) + + return ( + NativeFunction( + func=func, + use_const_ref_for_mutable_tensors=use_const_ref_for_mutable_tensors, + variants=variants, + structured=structured, + structured_delegate=structured_delegate, + structured_inherits=structured_inherits, + precomputed=precomputed, + autogen=autogen, + ufunc_inner_loop=ufunc_inner_loop, + manual_kernel_registration=manual_kernel_registration, + manual_cpp_binding=manual_cpp_binding, + python_module=python_module, + category_override=category_override, + device_guard=device_guard, + device_check=device_check, + loc=loc, + cpp_no_default_args=cpp_no_default_args, + is_abstract=is_abstract, + has_composite_implicit_autograd_kernel=has_composite_implicit_autograd_kernel, + has_composite_implicit_autograd_nested_tensor_kernel=has_composite_implicit_autograd_nested_tensor_kernel, + has_composite_explicit_autograd_kernel=has_composite_explicit_autograd_kernel, + has_composite_explicit_autograd_non_functional_kernel=has_composite_explicit_autograd_non_functional_kernel, + tags=tags, + namespace=namespace, + ), + backend_metadata, + ) + + def validate_unstructured(self) -> None: + # TODO: probably better to accumulate these errors and report them all + # at once + assert not self.structured, ( + "This function is structured, but there was " + "no valid functional variant of it." + ) + assert self.structured_delegate, ( + "This function delegates to another structured out function, " + "but no valid function was found (the delegate may not exist, or it has the wrong type)" + ) + + # __post_init__ functions in dataclasses can be used to do extra + # validation after construction. + # + # Notice that we don't do any type validation here. In fact, we + # rely exclusively on mypy to check if you've done types correctly! + # Validation is for nontrivial invariants that cannot be (conveniently) + # encoded in the type system. + def __post_init__(self) -> None: + if self.func.arguments.out: + assert self.variants == {Variant.function}, ( + "Native functions with out arguments MUST " + "be declared with only function variant; e.g., variants: function; " + "otherwise you will tickle a Python argument binding bug " + "(which usually manifests itself as the result variable being undefined.)" + ) + if self.structured: + assert self.func.kind() == SchemaKind.out, ( + "Put structured field on the out= " + "variant of a function; did you mean structured_delegate?" + ) + assert ( + self.device_guard + ), "device_guard: False is not respected by structured kernels" + if self.structured_delegate: + assert self.func.kind() != SchemaKind.out, ( + "structured_delegate field not allowed " + "on out= functions; did you mean structured?" 
+ ) + assert ( + self.device_guard + ), "device_guard: False is not respected by structured kernels" + # Technically, with the asserts above, this assert is impossible to + # happen + assert not ( + self.structured and self.structured_delegate + ), "Cannot have both structured and structured_delegate on function" + defaulted_arguments = { + a.name for a in self.func.schema_order_arguments() if a.default is not None + } + invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) + assert len(invalid_args) == 0, f"Invalid cpp_no_default_args: {invalid_args}" + if self.structured_inherits is not None: + assert ( + self.structured + ), "structured_inherits must also imply structured: True" + if str(self.func.name).startswith("_foreach"): + assert self.device_check == DeviceCheckType.NoCheck, ( + "foreach kernels fall back to slow path when tensor are on different devices, " + "device_check not allowed to be enabled" + ) + + # NB: if your function accidentally has rand/dropout/... in its name + # but is not actually random, feel free to amend this to special case + if ( + "rand" in str(self.func.name) + or ( + ( + "dropout" in str(self.func.name) + or any( + "dropout" in arg.name for arg in self.func.arguments.flat_all + ) + ) + # Backwards of dropout is typically deterministic + and "backward" not in str(self.func.name) + and str(self.func.name.name) not in ["_cudnn_init_dropout_state"] + ) + or self.func.arguments.has_generator_arg() + ): + assert "nondeterministic_seeded" in self.tags, str(self.func.name) + + @property + def has_composite_kernel(self) -> bool: + return ( + self.has_composite_implicit_autograd_kernel + or self.has_composite_explicit_autograd_kernel + or self.has_composite_explicit_autograd_non_functional_kernel + ) or ( + self.has_composite_implicit_autograd_kernel + and self.has_composite_implicit_autograd_nested_tensor_kernel + ) + + @property + def is_view_op(self) -> bool: + rets = self.func.returns + is_non_mutating_view = len(rets) > 0 and any( + r.annotation is not None and not r.annotation.is_write for r in rets + ) + # See Note [resize_ in Functionalization] for more dtails + is_inplace_view = ( + "inplace_view" in self.tags + and str(self.func.name) != "resize_" + and str(self.func.name) != "resize_as_" + ) + is_wildcard_view = any( + inp.annotation is not None and "*" in inp.annotation.alias_set_after + for inp in self.func.schema_order_arguments() + ) + return is_non_mutating_view or is_inplace_view or is_wildcard_view + + @property + def view_schema_kind(self) -> ViewSchemaKind: + if self.is_view_op and self.func.name.name.inplace: + assert "inplace_view" in self.tags + return ViewSchemaKind.aliasing_inplace + if self.is_view_op: + return ViewSchemaKind.aliasing + else: + return ViewSchemaKind.non_aliasing + + @property + def root_name(self) -> str: + return self.func.name.name.base + + @property + def part_of_structured_group(self) -> bool: + return self.structured or self.structured_delegate is not None + + +class SchemaKind(Enum): + functional = auto() + inplace = auto() + out = auto() + mutable = auto() + scratch = auto() + + +# A structured kernel is guaranteed to have a functional and out variant, and +# optionally an inplace variant. +# +# NB: we create NativeFunctionsGroup *even if* the function is not +# actually annotated structured. Test the structured boolean to see if it +# actually is structured or not. 
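+# For example, the "add" operator forms a group out of roughly these entries
+# (there is no mutable variant in this case):
+#
+#   add.Tensor  -> functional
+#   add_.Tensor -> inplace
+#   add.out     -> out
+#
+# NativeFunctionsGroup.from_dict below assembles such a group from parsed
+# NativeFunctions keyed by SchemaKind.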
+@dataclass(frozen=True) +class NativeFunctionsGroup: + functional: NativeFunction + inplace: NativeFunction | None + mutable: NativeFunction | None + out: NativeFunction + + @property + def structured(self) -> bool: + # Whether or not the operator has a meta() function. This information is backend-agnostic. + return self.out.structured + + def __post_init__(self) -> None: + test_sig: FunctionSchema = self.functional.func.signature() + for f in self.functions(): + if test_sig != f.func.signature(): + raise AssertionError( + "NativeFunctionsGroup constructed from two NativeFunctions " + f"that don't have matching signatures: {test_sig} != {f.func.signature()}" + ) + + if self.structured != f.part_of_structured_group: + raise AssertionError( + "NativeFunctionsGroup constructed from structured and unstructured " + f"functions: {self.out.func.name} and {f.func.name}" + ) + assert self.functional.func.kind() == SchemaKind.functional + assert self.out.func.kind() == SchemaKind.out + assert self.functional.namespace == self.out.namespace + if self.inplace is not None: + assert self.inplace.func.kind() == SchemaKind.inplace + assert self.inplace.namespace == self.functional.namespace + + if self.mutable is not None: + assert self.mutable.func.kind() == SchemaKind.mutable + assert self.mutable.namespace == self.functional.namespace + # See Note [Overload Ambiguity With Functional Variants] + assert self.functional.func.name.name.functional_overload + + if self.structured: + # For now, structured composite kernels are not supported (need some + # design work to figure out how to make the composite case work) + assert ( + not self.out.has_composite_implicit_autograd_kernel + and not self.out.has_composite_implicit_autograd_nested_tensor_kernel + ) + + assert self.functional.structured_delegate == self.out.func.name, ( + f"{self.functional.func.name} delegates to {self.functional.structured_delegate} " + f"but its actual delegate is {self.out.func.name}" + ) + if self.inplace is not None: + assert self.inplace.structured_delegate == self.out.func.name + + generated_fns = sorted( + [str(f.func.name) for f in self.functions() if "generated" in f.tags] + ) + generated_fns_str = ", ".join(str(x) for x in generated_fns) + expected_generated_fns: set[str] = set() + for f in self.functions(): + expected_generated_fns.update(str(op) for op in f.autogen) + expected_generated_fns_str = ", ".join( + str(x) for x in sorted(expected_generated_fns) + ) + if len(expected_generated_fns) == 0 and len(generated_fns) > 0: + raise RuntimeError( + f"The codegen expects to be able to generate '{generated_fns_str}'." + " In order to generate them however, we expect them to be called out explicitly in the yaml." + f" Please add an 'autogen: {generated_fns_str}' line to the entry for {str(f.func.name)}" + ) + if expected_generated_fns_str != generated_fns_str: + raise RuntimeError( + f"The codegen expects to be able to generate '{generated_fns_str}'." + f" To do so, it expects a line: 'autogen: {generated_fns_str}'." 
+ f" Instead, it found 'autogen: {expected_generated_fns_str}'" + ) + + def signature(self) -> FunctionSchema: + return self.out.func.signature() + + def functions(self) -> Iterator[NativeFunction]: + yield self.functional + yield self.out + if self.inplace is not None: + yield self.inplace + if self.mutable is not None: + yield self.mutable + + @property + def root_name(self) -> str: + return self.functional.root_name + + @staticmethod + def from_dict(d: dict[SchemaKind, NativeFunction]) -> NativeFunctionsGroup | None: + assert d + if len(d) == 1: + return None + d = dict(d) # non-destructive updates please + functional = d.pop(SchemaKind.functional, None) + inplace = d.pop(SchemaKind.inplace, None) + mutable = d.pop(SchemaKind.mutable, None) + out = d.pop(SchemaKind.out, None) + assert not d + assert functional is not None + # There are a few operators which only have functional/inplace variants; + # these don't count as structured for our purposes here + if out is None: + return None + # assuming all variants have the same namespace + return NativeFunctionsGroup( + functional=functional, + inplace=inplace, + mutable=mutable, + out=out, + ) + + +@dataclass(frozen=True) +class BackendMetadata: + # The name of the backend kernel, for a given operator + # for in-tree backends. These names come directly from the 'dispatch" field + # in native_functions.yaml. The dispatch entry is optional; in that + # case, that is equivalent to having written: + # + # dispatch: + # CompositeImplicitAutograd: $operator_name + kernel: str + # Whether or not the operator has a structured kernel implemented, for this particular backend. + # For in-tree backends, they all have the same value for structured- this is listed + # in native_functions.yaml. + # However, external backends like XLA can indendently toggle which ops are structured. + structured: bool + + # The namespace for kernels, default value: DEFAULT_KERNEL_NAMESPACE + cpp_namespace: str + + def supports_symint(self) -> bool: + return "_symint" in self.kernel + + +@dataclass(frozen=True) +class UfuncInnerLoop: + name: str + supported_dtypes: OrderedSet[ScalarType] + # key is stored here because it affects the semantics of name, + # so its helpful to have them together for further processing + ufunc_key: UfuncKey + + @staticmethod + def parse(value: str, ufunc_key: UfuncKey) -> UfuncInnerLoop: + name, supported_dtypes_str = value.split(" ", 1) + assert supported_dtypes_str[0] == "(" + assert supported_dtypes_str[-1] == ")" + supported_dtypes: OrderedSet[ScalarType] = OrderedSet() + for k in supported_dtypes_str[1:-1].split(", "): + supported_dtypes |= ScalarType.parse_set(k) + return UfuncInnerLoop( + name=name, supported_dtypes=supported_dtypes, ufunc_key=ufunc_key + ) + + +# BackendIndex represents a backend. +# The BackendIndex encodes per-operator information that is potentially different +# for each backend. The most obvious example is the name of the kernel +# (the 'dispatch' entry in native_functions.yaml). +# However, there can be other examples of different backends having different information. +# External backends can choose to opt their kernels to be structured independently from in-tree backends, +# which means that this information isn't inherently tied to a NativeFunction- it's different per backend. +@dataclass(frozen=True) +class BackendIndex: + dispatch_key: DispatchKey + # Mainly important for structured kernels, this determines which variant in the operator group is used to implement the others. 
+ # All in-tree ops use out kernels, while XLA uses functional kernels. + use_out_as_primary: bool + # Whether the backend requires a device guard, and device checks. + # For in-tree backends, this is currently just CUDA/HIP + # For out-of-tree backends, this is currently just Intel XPU + device_guard: bool + # Whether the backend is in-tree (CPU/CUDA) or out-of-tree (XLA) + external: bool + # Other backend-specific information that is on a per-operator basis + index: dict[OperatorName, BackendMetadata] + + @staticmethod + def grow_index( + parent_index: dict[DispatchKey, dict[OperatorName, BackendMetadata]], + child_index: dict[DispatchKey, dict[OperatorName, BackendMetadata]], + ) -> None: + for k, v in child_index.items(): + for op_name, metadata in v.items(): + assert ( + op_name not in parent_index[k] + ), f"duplicate operator {op_name} for dispatch key {k}" + parent_index[k][op_name] = metadata + + def primary(self, g: NativeFunctionsGroup) -> NativeFunction: + if self.use_out_as_primary: + return g.out + else: + return g.functional + + def has_kernel(self, g: NativeFunction | NativeFunctionsGroup) -> bool: + m = self.get_kernel(g) + return m is not None + + def get_kernel( + self, g: NativeFunction | NativeFunctionsGroup + ) -> BackendMetadata | None: + if isinstance(g, NativeFunction): + f = g + elif isinstance(g, NativeFunctionsGroup): + f = self.primary(g) + else: + assert_never(g) + if f.func.name not in self.index: + return None + return self.index[f.func.name] + + def native_function_class_name(self) -> str | None: + if self.external: + return f"{str(self.dispatch_key)}NativeFunctions" + else: + # TODO: This discrepancy isn't required; we could also generated + # a class for in-tree kernels. It'll just require carefully + # updating every kernel definition + callsite of every in-tree aten kernel. + return None + + +# The function schema is undoubtedly the most important data structure +# in all of the codegen, as it defines the type signature for operators, +# and most of the code generation we do is type directed (e.g., look at +# the types, decide what to do. Think about how we code generate +# C++ function stubs!) +# +# We will also see in this class the general structure for how we model +# data in this code generation. A few notable properties to point out +# ahead of time: +# +# - These dataclasses are a *lossless* representation of the strings +# they are parsed from. In fact, we assert that given the +# information stored in the dataclass, we can exactly reconstruct +# the string we parsed from (and assert this inside the parse +# definition). There are a few reasons for this: +# +# - If you find that it is difficult to reconstruct the string +# given a dataclass, that is a clue that you are data +# representation is wrong. +# +# - It helps ensure that all relevant information is present +# in the dataclass, so that downstream users aren't tempted +# to reparse the original string to get some information +# that was omitted. +# +# - It forces you to represent the data in-memory in the same way +# it is recorded textually, which makes the dataclasses easier +# to understand for someone who is familiar with the +# textual format. (As a tradeoff, it means you have to model +# the syntax, even when it is inconvenient. But maybe that means +# the syntax is bad!) If you don't understand the internal +# representation, go look at the printing code to see how +# it maps onto the surface syntax! 
+#
+#   - It makes it easy to test the parsing code, as parsing code
+#     that is inconsistent with the string code will fail early
+#     and loudly. (As a tradeoff, it makes the parsing code a bit
+#     brittle; in particular, with trivial whitespace changes you
+#     are likely to trigger an assert error.)
+#
+#     In general, try to make the __str__ code as simple as possible
+#     (even at the cost of more complex parsing logic.) Additionally,
+#     try to minimize redundancy in data representation. (Precomputed
+#     fields are OK though: they are defined as a simple function on
+#     the canonical representation in question.)
+#
+#   - These dataclasses are all frozen; once constructed their
+#     values never change. This makes it easy to tell where any
+#     given data came from: just look to the constructor. As a
+#     tradeoff, you can't easily "decorate" a schema with extra
+#     information from a post-facto analysis. We impose this
+#     restriction to make these structures more understandable.
+#
+@dataclass(frozen=True)
+class FunctionSchema:
+    # The name of the operator this function schema describes.
+    name: OperatorName
+
+    arguments: Arguments
+
+    # TODO: Need to handle collisions with argument names at some point
+    returns: tuple[Return, ...]
+
+    @property
+    def is_mutable(self) -> bool:
+        def is_write(arg: Argument) -> bool:
+            if arg.annotation is None:
+                return False
+            return arg.annotation.is_write
+
+        # Corresponds to torch._C._FunctionSchema.is_mutable
+        # See aten/src/ATen/core/function_schema.h (keep these in sync)
+        return any(is_write(a) for a in self.arguments.flat_all)
+
+    def schema_order_arguments(self) -> Iterator[Argument]:
+        return itertools.chain(
+            self.arguments.flat_positional,
+            self.arguments.flat_kwarg_only,
+            self.arguments.out,
+        )
+
+    decl_re = re.compile(r"(?P<name>[^\(]+)\((?P<args>.*)\) -> (?P<returns>.*)")
+
+    @staticmethod
+    def parse(func: str) -> FunctionSchema:
+        # We should probably get a proper parser here
+        decls = FunctionSchema.decl_re.findall(func)
+        assert len(decls) == 1, f"Invalid function schema: {func}"
+        ops, args, return_decl = decls[0]
+        name = OperatorName.parse(ops)
+        arguments = Arguments.parse(args)
+        returns = parse_returns(return_decl)
+        r = FunctionSchema(name=name, arguments=arguments, returns=returns)
+        assert str(r) == func, f"{str(r)} != {func}"
+        return r
+
+    def returns_are_aliased(self) -> bool:
+        # We assert earlier that schemas can't have a mix of aliased and non-aliased returns
+        return any(
+            r
+            for r in self.returns
+            if r.annotation is not None and r.annotation.is_write
+        )
+
+    def __post_init__(self) -> None:
+        for arg, ret in zip(self.arguments.out, self.returns):
+            assert arg.annotation == ret.annotation, (
+                "Out arguments must have matching return Tensor; furthermore, "
+                "the ith-argument needs to correspond to the ith return"
+            )
+        # We also enforce that if you have any mutable, positional args, then they are not returned.
+        # This makes it easier to group these functions properly with their functional/out= counterparts.
+        for a in self.arguments.post_self_positional_mutable:
+            assert not any(
+                a.annotation == r.annotation for r in self.returns
+            ), f"If you have a schema with mutable positional args, we expect them to not be returned. schema: {str(self)}"
+        # Invariant: we expect out arguments to appear as keyword arguments in the schema.
+ # This means that all mutable returns should be aliased to a keyword argument + # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) + # See Note [is_out_fn] + out_and_self = list(self.arguments.out) + [ + arg for arg in self.arguments.flat_positional if arg.name == "self" + ] + mutable_returns = [ + ret + for ret in self.returns + if ret.annotation is not None and ret.annotation.is_write + ] + immutable_returns = [ + ret + for ret in self.returns + if ret.annotation is None or not ret.annotation.is_write + ] + # Some assertions: We don't want any functions with a return type of "-> (Tensor(a!), Tensor)", + # because: + # (1) It's more annoying to handle properly + # (2) It's unnecessary - you can't method-chain on the first (mutated) output because it's part of a tuple. + # Instead, we expect the (a!) argument to not be returned. + assert ( + len(mutable_returns) == 0 or len(immutable_returns) == 0 + ), f"NativeFunctions must have either only mutable returns, or only immutable returns. Found: {str(self)}" + for ret in mutable_returns: + assert any(ret.annotation == arg.annotation for arg in out_and_self), ( + 'All mutable returns must be aliased either to a keyword argument, or to "self". ' + "Did you forget to mark an out argument as keyword-only?" + ) + if self.arguments.out: + # out= ops that return their mutable inputs are only really useful for method chaining. + # And method chaining is only really useful if the thing you're returning is a plain Tensor. + # So ideally, we'd enforce that out= ops with a single plain mutable tensor should return the tensor, + # and all other types of out= op schemas should return void. + # There are a bunch of existing out= ops that return tuples of tensors though, so we're stuck with allowing that. + if any(a.type != BaseType(BaseTy.Tensor) for a in self.arguments.out): + assert ( + len(self.returns) == 0 + ), "out= ops that accept tensor lists as out arguments " + "are expected to have no return type (since you can't do method chaining on them)" + else: + # mutable keyword arguments whose name has _scratch_ prefix are + # scratch tensors for memory planning and should not be returned + assert len( + [ + arg + for arg in self.arguments.out + if not arg.name.startswith("_scratch_") + ] + ) == len( + self.returns + ), "Must return as many arguments as there are out arguments, or no return at all" + + if self.name.name.inplace: + self_a = self.arguments.self_arg + assert ( + self_a + and self_a.argument.annotation + and self_a.argument.annotation.is_write + ) + if self_a.argument.type == BaseType(BaseTy.Tensor): + # All inplace ops with an ordinary `Tensor self` argument should return self, + # to allow for method chaining. + assert ( + len(self.returns) == 1 + and self.returns[0].annotation == self_a.argument.annotation + ) + else: + # You can't method chain on non-tensor self arguments though (like a List[Tensor]) + # so in all other cases we expect the return type to be none. + assert len(self.returns) == 0 + + if self.arguments.tensor_options is not None: + assert self.kind() == SchemaKind.functional, ( + "Found an operator that is not functional or out variant, but has tensor options arguments." + "This is not allowed- tensor options arguments are only allowed for factory functions." + f"schema: {str(self)}" + ) + if self.is_functional_fn(): + assert self.kind() == SchemaKind.functional, ( + "Found an operator that is not functional, but its overload contains the string 'functional'." 
+ "This is a special keyword in the codegen, please use a different overload name." + f"schema: {str(self)}" + ) + + def is_functional_fn(self) -> bool: + return "functional" in self.name.overload_name + + def is_out_fn(self) -> bool: + # Note [is_out_fn] + # + # out functions are the variants which take an explicit out= argument + # to populate into. We need to know if a schema corresponds to an + # out function for several reasons: + # + # - They codegen differently in C++ API + # - codegen to at::add_out rather than at::add + # - out argument is moved to front of C++ argument list + # + # out functions are DEFINED to be any function with a keyword-only + # argument that is mutable. In principle, this could lead to a + # false positive if you define a function that mutates a + # kwarg only argument, but this isn't the "true" output of this + # function. A more robust definition that would work in this + # case would also look at: + # + # - The output types. Out functions take in the arguments + # they mutate and then return them again; this is sort + # of "definitionally" what makes something an out function. + # Historically, we DO check this for consistency. + # - Correspondence with pure variant. An out function + # should have a signature equivalent to its pure variant, + # but just with extra kwargs for the output elements. This + # is difficult to actually check for and historically + # we only do this check in tools/ + return bool(self.arguments.out) + + def kind(self) -> SchemaKind: + """ + What kind of schema is this? A functional schema is one + that returns a newly allocated output; an inplace schema + modifies the self argument inplace; an out schema writes + the result into an explicitly provided out argument. + """ + is_out = bool(self.arguments.out) + is_scratch = bool( + [arg for arg in self.arguments.out if arg.name.startswith("_scratch_")] + ) + is_inplace = self.name.name.inplace + is_mutable = any( + a.annotation is not None and a.annotation.is_write + for a in self.arguments.post_self_positional + ) + assert not (is_out and is_inplace) + # out= and inplace schemas can also have post_self_positional mutable args, + # but we give precedence to out= and inplace when deciding the schema kind. + # Tradeoff: we probably don't want to have to teach codegen that looks at inplace ops + # to also worry about mutable post_self_positional arguments, + # but it seems like a much bigger lift to classify them has having a new schema kind. + # The number of ops that fit in this strange category is small enough that + # we can probably manually write code for them instead of forcing the codegen to handle them. + if is_inplace: + return SchemaKind.inplace + elif is_scratch: + assert ( + is_out + ), "invariant: all scratch operators are expected to be out= operators too" + return SchemaKind.scratch + elif is_out: + assert ( + not is_scratch + ), "We should not categorize a scratch op as an out variant. Check if the order of if statements are expected!" + return SchemaKind.out + elif is_mutable: + return SchemaKind.mutable + else: + return SchemaKind.functional + + # For every return: + # - If the return aliases an input, we return the input name + # - Otherwise, we return None. + # If return names were enforced to be consistent with aliasing information, then we wouldn't need this. 
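+    # For instance, for a schema like
+    #   add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    # this would give ["out"], while a purely functional schema gives [None]
+    # for each return.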
+ def aliased_return_names(self) -> list[str | None]: + outs: list[str | None] = [] + for r in self.returns: + aliased_args = [ + a + for a in self.arguments.flat_all + if a.annotation is not None and a.annotation == r.annotation + ] + if len(aliased_args) == 0: + outs.append(None) + elif len(aliased_args) == 1: + outs.append(aliased_args[0].name) + else: + aliased_names = ", ".join(a.name for a in aliased_args) + raise AssertionError( + f"Found a return ({r.name})that aliases multiple inputs ({aliased_names})" + ) + return outs + + def signature( + self, + *, + strip_default: bool = False, + strip_view_copy_name: bool = False, + keep_return_names: bool = False, + ) -> FunctionSchema: + """ + Certain schemas are 'related', in that they are simply + inplace/out/functional versions of the same function. This method + factors these schemas into the "core" functional signature which + is equal across all versions. + + Here is what normalization happens to the schema to convert + it to a signature: + - The overload name is stripped (name is retained, since + it expresses semantic content about what the function does) + - Inplace is set False + - Out arguments are stripped + - Mutable post_self_positional args are converted to returns + - Mutability annotations are stripped (this is sound + because you cannot overload on mutability annotation) + - Return names are stripped since they are not overloadable and + some variants have return names but some not + - TensorOptions are dropped + because out= variants of factory functions don't include them + (and we want to be able to pair up factory functions with their out variants) + + Finally, we want to be able to pair up related "view" and their + corresponding "view_copy" operators. We do this by optionally + stripping the trailing "_copy" from the base name. + + Example of a mutable op before and after: + + f.func (Mutable operator): + _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) 
zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) # noqa: B950 + + f.func (Corresponding functional operator): + _fused_moving_avg_obs_fq_helper.functional(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor running_min, Tensor running_max, Tensor scale, Tensor zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask, Tensor running_min_out, Tensor running_max_out, Tensor scale_out, Tensor zero_point_out) # noqa: B950 + + f.func.signature() output: + _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor running_min, Tensor running_max, Tensor scale, Tensor zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) # noqa: B950 + """ + + def strip_ret_annotation(r: Return) -> Return: + return Return( + name=r.name if keep_return_names else None, + type=r.type, + annotation=None, + ) + + base_name = self.name.name.base + if strip_view_copy_name: + if base_name.endswith("_copy"): + base_name = base_name.replace("_copy", "") + elif base_name.endswith("_scatter"): + base_name = base_name.replace("scatter", "inverse") + + # find mutable inputs that are not originally returned, and convert them to returns + returns_from_mutable_inputs = tuple( + # When we're grouping functions we strip the return names, + # but when we're generating the actual functional variants then we follow + # a convention for what to name the returns + Return( + name=f"{a.name}_out" if keep_return_names else None, + type=a.type, + annotation=None, + ) + for a in itertools.chain( + # Order is important here (otherwise e.g. inplace with mutable args + # and out= with mutable args won't have the same signature) + [self.arguments.self_arg.argument] + if self.arguments.self_arg is not None + else [], + self.arguments.out, + self.arguments.post_self_positional, + ) + if a.annotation is not None + and a.annotation.is_write + and not any(a.annotation == r.annotation for r in self.returns) + ) + original_returns = tuple(map(strip_ret_annotation, self.returns)) + # Ordering is important here. We expect the "mutable input" returns to come last. 
+ returns = original_returns + returns_from_mutable_inputs + + args_sig = self.arguments.signature(strip_default=strip_default) + # See Note [bernoulli.p schema] + if str(self.name) == "bernoulli.p": + args_sig = Arguments.parse(str(args_sig).replace("float p", "float p=0.5")) + + return FunctionSchema( + name=OperatorName( + name=BaseOperatorName( + base=base_name, + inplace=False, + dunder_method=self.name.name.dunder_method, + ), + overload_name="", # stripped + ), + arguments=args_sig, + returns=returns, + ) + + def view_signature(self) -> FunctionSchema: + return self.signature(strip_view_copy_name=True) + + def with_name(self, name: OperatorName) -> FunctionSchema: + return FunctionSchema( + name=name, + arguments=self.arguments, + returns=self.returns, + ) + + @property + def modifies_arguments(self) -> bool: + return self.kind() in [SchemaKind.inplace, SchemaKind.out, SchemaKind.mutable] + + def has_symint(self) -> bool: + return self.arguments.has_symint_arg() + + def __str__(self) -> str: + all_arguments_str = str(self.arguments) + if len(self.returns) == 1: + returns = str(self.returns[0]) # omit parentheses + else: + returns = "(" + ", ".join(map(str, self.returns)) + ")" + return f"{self.name}({all_arguments_str}) -> {returns}" + + +# Here is the rest of the data model, described more briefly. + + +# Simplified version for what actually shows up in built-ins. +# Look at alias_info.h for expanded syntax. If you need the structure, +# you also need to make this structure recursive so it can be lined +# up with the type components too. For primitives this isn't really +# necessary +@dataclass(frozen=True) +class Annotation: + # Typically only has one element. Not actually a set so + # we can conveniently assume it is canonically ordered + alias_set: tuple[str, ...] + is_write: bool + alias_set_after: tuple[str, ...] + + @staticmethod + def parse(ann: str) -> Annotation: + # TODO: implement a proper parser if this gets more ugly + # Regex Explanation: + # Example: "a! -> a|b" + # Group #1: alias before optional '|', required. Matches the first + # character 'a' in the example + # Group #2: optional alias set after optional '|', matches empty string + # in the example + # Group #3: optional "is write" flag, matches '!' in the example. + # Group #4: optional section containing arrow, matches " -> a|b" in the + # example. + # Group #5: optional alias after set, supports wildcard, matches "a|b" + # in the example. + # Group #6: optional sub-section of alias after set, matches "|b" in the + # example. + m = re.match(r"^([a-z])(\|[a-z])*(!?)( -> (\*|[a-z](\|[a-z])*))?$", ann) + + assert m is not None, f"unrecognized alias annotation {ann}" + before_alias = m.group(1) + (m.group(2) if m.group(2) else "") + alias_set = tuple(before_alias.split("|")) + is_write = m.group(3) == "!" + assert not ( + is_write and len(alias_set) > 1 + ), f"alias set larger than 1 is not mutable, got {ann} instead." + after_set = tuple(m.group(5).split("|")) if m.group(5) else () + assert not ( + len(before_alias) > 1 and len(after_set) > 1 + ), f"before alias set and after alias set cannot be larger than 1 at the same time, got {ann} instead." + r = Annotation( + alias_set=alias_set, is_write=is_write, alias_set_after=after_set + ) + assert str(r) == ann, f"{r} != {ann}" + return r + + def __str__(self) -> str: + alias_set = "|".join(self.alias_set) + if self.is_write: + alias_set = f"{alias_set}!" 
+ alias_set_after = "|".join(self.alias_set_after) + if alias_set_after: + alias_set = f'{alias_set}{" -> "}{alias_set_after}' + return alias_set + + +# The base class for the type system. This is also loosely modeled +# off of jit_type.h, but we've simplified the hierarchy to focus +# in on the aspects of the type system that matter for code generation +# (for example, there's no SingleElementType subclass anymore). +# You never actually construct a Type; usually it's going to be one +# of the subclasses. If Python had ADTs this would be one! +@dataclass(frozen=True) +class Type: + @staticmethod + def parse(t: str) -> Type: + r = Type._parse(t) + assert str(r) == t, f"{r} != {t}" + return r + + @staticmethod + def _parse(t: str) -> Type: + m = re.match(r"^(.+)\?$", t) + if m is not None: + return OptionalType(Type.parse(m.group(1))) + m = re.match(r"^(.+)\[([0-9]+)?\]$", t) + if m is not None: + size = int(m.group(2)) if m.group(2) is not None else None + return ListType(elem=Type.parse(m.group(1)), size=size) + + # '__torch__.torch.classes.' is the prefix for custom class + m = re.match(r"^__torch__\.torch\.classes\.([a-zA-Z0-9_.]+)$", t) + if m is not None: + return CustomClassType(m.group(1)) + try: + return BaseType(BaseTy[t]) + except KeyError as e: + raise RuntimeError(f"unrecognized type {t}") from e + + def __str__(self) -> str: + raise NotImplementedError + + # WARNING: These concepts are not very well-defined. For example, + # is "int?" nullable? How about "int?[]". They are defined + # so we can conveniently generate legacy Declarations.yaml but + # really we should probably just remove these at some point + + def is_base_ty_like(self, base_ty: BaseTy) -> bool: + raise NotImplementedError + + def is_tensor_like(self) -> bool: + return self.is_base_ty_like(BaseTy.Tensor) + + def is_generator_like(self) -> bool: + return self.is_base_ty_like(BaseTy.Generator) + + def is_symint_like(self) -> bool: + return self.is_base_ty_like(BaseTy.SymInt) + + def is_nullable(self) -> bool: + raise NotImplementedError + + def is_list_like(self) -> ListType | None: + raise NotImplementedError + + +# Base types are simple, atomic types with no further structure +class BaseTy(Enum): + Generator = auto() + ScalarType = auto() + Tensor = auto() + int = auto() + Dimname = auto() + DimVector = auto() + float = auto() + str = auto() + bool = auto() + Layout = auto() + Device = auto() + DeviceIndex = auto() + Scalar = auto() + MemoryFormat = auto() + QScheme = auto() + Storage = auto() + Stream = auto() + SymInt = auto() + SymBool = auto() + ConstQuantizerPtr = auto() # TODO: rename + GraphModule = auto() + + +@dataclass(frozen=True) +class BaseType(Type): + name: BaseTy + + def __str__(self) -> str: + return f"{self.name.name}" + + def is_base_ty_like(self, base_ty: BaseTy) -> bool: + return self.name == base_ty + + def is_nullable(self) -> bool: + return False + + def is_list_like(self) -> ListType | None: + return None + + def is_symint_like(self) -> bool: + return self.name == BaseTy.SymInt + + +# Optional types may be specified, or may also be validly given None +@dataclass(frozen=True) +class OptionalType(Type): + elem: Type + + def __str__(self) -> str: + return f"{self.elem}?" 
+ + def is_base_ty_like(self, base_ty: BaseTy) -> bool: + return self.elem.is_base_ty_like(base_ty) + + def is_symint_like(self) -> bool: + return self.elem.is_symint_like() + + def is_nullable(self) -> bool: + return True + + def is_list_like(self) -> ListType | None: + return self.elem.is_list_like() + + +# A type representing a PyTorch custom class +@dataclass(frozen=True) +class CustomClassType(Type): + class_name: str + + def __str__(self) -> str: + """ + Return the class name will prefix __torch__.torch.classes + """ + return f"__torch__.torch.classes.{self.class_name}" + + def is_base_ty_like(self, base_ty: BaseTy) -> bool: + return False + + def is_symint_like(self) -> bool: + return False + + def is_nullable(self) -> bool: + """ + Assume a custom class is not nullable. + """ + return False + + def is_list_like(self) -> ListType | None: + return None + + +# List types specify that we may have multiples of an element. We +# also support explicit sizes on list types, but these have +# some nontrivial semantics! (However, for C++ API purposes, explicit +# sizes are mostly erased from the type system.) +# +# DANGER WILL ROBINSON: C++ elaboration depends on elem type; e.g., +# int[] elaborates differently than bool[3]! +@dataclass(frozen=True) +class ListType(Type): + elem: Type + size: int | None + + def __str__(self) -> str: + size = f"{self.size}" if self.size else "" + return f"{self.elem}[{size}]" + + def is_base_ty_like(self, base_ty: BaseTy) -> bool: + return self.elem.is_base_ty_like(base_ty) + + def is_symint_like(self) -> bool: + return self.elem.is_symint_like() + + def is_nullable(self) -> bool: + return self.elem.is_nullable() + + def is_list_like(self) -> ListType | None: + return self + + +@dataclass(frozen=True) +class Argument: + # NB: I didn't put kwarg_only as a boolean field here, unlike + # c10::Argument, so that printing works correctly + + name: str + type: Type + default: str | None + + # The semantics of the annotation field are a little strange. + # + # Alias annotations parametrize Tensors (since Tensors are the only things + # that can alias.) This motivates why I write Tensor(a!)? (and not, for + # example, Tensor?(a!)), because the (a!) describes aliasing on the tensor, + # which may be optional (i.e., the alias annotation should bind first to + # Tensor, before the optional postfix annotation). + # + # However, despite being a property of Tensor, we (and c10::Argument) + # store the annotation at the top level of the Argument, rather than + # inside the embedded Tensor type. In the C++ version of this + # class, we then go through great lengths to mimic the type + # structure in the annotation structure so we can correlate + # annotations with types. + # + # Now, it turns out, in all applications in code generation, the + # structure of annotated types is very simple. So we just hard + # code it here. But if we ever do get anything more complex, this + # model will have to change! 
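+    # A few illustrative inputs that Argument.parse below accepts:
+    #
+    #   "Tensor self"      -> plain tensor argument, no annotation
+    #   "Tensor(a!) out"   -> mutable tensor aliased to alias set "a"
+    #   "Scalar alpha=1"   -> scalar argument with a default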
+ annotation: Annotation | None + + @property + def alias_info(self) -> Annotation | None: + return self.annotation + + @staticmethod + def parse(arg: str) -> Argument: + name: str + default: str | None + assert " " in arg, f"illegal argument '{arg}'" + if "=" in arg: + assert arg.count("=") == 1, f"illegal argument with default value: '{arg}'" + type_and_annot_and_name, default = arg.split("=") + type_and_annot, name = type_and_annot_and_name.rsplit(" ", 1) + name_and_default = f"{name}={default}" + else: + type_and_annot, name_and_default = arg.rsplit(" ", 1) + name = name_and_default + default = None + # TODO: deduplicate annotation matching with Return + match = re.match(r"Tensor\((.+)\)(.*)", type_and_annot) + annotation: Annotation | None + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in [ + "", + "?", + "[]", + ], "unrecognized alias analysis form with Tensor" + type_s = "Tensor" + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Argument( + name=name, + type=type, + default=default, + annotation=annotation, + ) + assert str(r) == arg, f"{str(r)} != {arg}" + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f"{self.type}" + if self.annotation: + assert type in ["Tensor", "Tensor?", "Tensor[]"] + type = type.replace("Tensor", f"Tensor({self.annotation})") + if self.name is None: + return type + else: + mb_default = "" + if self.default: + mb_default = f"={self.default}" + return f"{type} {self.name}{mb_default}" + + +@dataclass(frozen=True) +class Return: + name: str | None + type: Type + annotation: Annotation | None + + @property + def alias_info(self) -> Annotation | None: + return self.annotation + + @staticmethod + def parse(arg: str) -> Return: + name: str | None + if " " in arg: + type_and_annot, name = arg.rsplit(" ", 1) + else: + type_and_annot = arg + name = None + match = re.match(r"Tensor\((.+)\)(.*)", type_and_annot) + annotation: Annotation | None + if match: + # If you update this, make sure the __str__ still works too + assert match.group(2) in [ + "", + "?", + "[]", + ], "unrecognized alias analysis form with Tensor" + type_s = "Tensor" + match.group(2) + annotation = Annotation.parse(match.group(1)) + else: + type_s = type_and_annot + annotation = None + type = Type.parse(type_s) + r = Return( + name=name, + type=type, + annotation=annotation, + ) + assert str(r) == arg, f"{str(r)} != {arg}" + return r + + @property + def is_write(self) -> bool: + return self.annotation is not None and self.annotation.is_write + + def __str__(self) -> str: + type = f"{self.type}" + if self.annotation: + assert type in ["Tensor", "Tensor?", "Tensor[]"] + type = type.replace("Tensor", f"Tensor({self.annotation})") + if self.name is None: + return type + else: + return f"{type} {self.name}" + + +# Represents the self argument for functions that may be methods +@dataclass(frozen=True) +class SelfArgument: + argument: Argument + + +# Bundle of arguments that represent a TensorOptions. 
This is mostly +# relevant for the public C++ API but we bake it into the core data +# model because other APIs often have to interact with it +@dataclass(frozen=True) +class TensorOptionsArguments: + dtype: Argument + layout: Argument + device: Argument + pin_memory: Argument + + def all(self) -> Sequence[Argument]: + return [self.dtype, self.layout, self.device, self.pin_memory] + + +@dataclass(frozen=True) +class Arguments: + # pre_self_positional is usually empty, but is notably non-empty + # for where.self, where the condition argument comes before the + # self argument + pre_self_positional: tuple[Argument, ...] + self_arg: SelfArgument | None + post_self_positional: tuple[Argument, ...] + + pre_tensor_options_kwarg_only: tuple[Argument, ...] + tensor_options: TensorOptionsArguments | None + # post_tensor_options is typically memory format, which should be + # part of tensor options but isn't right now, and is usually + # placed after the tensor options arguments + post_tensor_options_kwarg_only: tuple[Argument, ...] + + # Unlike in the previous codegen, we have factored out 'out' arguments + # in the canonical representation, removing them from kwarg + # arguments. This choice is justified by numerous downstream + # transformations which treat out arguments specially; additionally, + # you can see that canonicity is not violated! + out: tuple[Argument, ...] # these are also kwarg-only + + @property + def flat_non_out(self) -> Sequence[Argument]: + ret: list[Argument] = [] + ret.extend(self.flat_positional) + ret.extend(self.flat_kwarg_only) + return ret + + @property + def flat_positional(self) -> Sequence[Argument]: + ret: list[Argument] = [] + ret.extend(self.pre_self_positional) + if self.self_arg is not None: + ret.append(self.self_arg.argument) + ret.extend(self.post_self_positional) + return ret + + @property + def post_self_positional_mutable(self) -> Sequence[Argument]: + return [a for a in self.post_self_positional if a.is_write] + + # NB: doesn't contain out arguments + @property + def flat_kwarg_only(self) -> Sequence[Argument]: + ret: list[Argument] = [] + ret.extend(self.pre_tensor_options_kwarg_only) + if self.tensor_options is not None: + ret.extend(self.tensor_options.all()) + ret.extend(self.post_tensor_options_kwarg_only) + return ret + + @property + def flat_all(self) -> Sequence[Argument]: + ret: list[Argument] = [] + ret.extend(self.flat_positional) + ret.extend(self.flat_kwarg_only) + ret.extend(self.out) + return ret + + @property + def non_out( + self, + ) -> Sequence[Argument | SelfArgument | TensorOptionsArguments]: + ret: list[Argument | SelfArgument | TensorOptionsArguments] = [] + ret.extend(self.positional) + ret.extend(self.kwarg_only) + return ret + + @property + def positional(self) -> Sequence[Argument | SelfArgument]: + ret: list[Argument | SelfArgument] = [] + ret.extend(self.pre_self_positional) + if self.self_arg is not None: + ret.append(self.self_arg) + ret.extend(self.post_self_positional) + return ret + + @property + def kwarg_only(self) -> Sequence[Argument | TensorOptionsArguments]: + ret: list[Argument | TensorOptionsArguments] = [] + ret.extend(self.pre_tensor_options_kwarg_only) + if self.tensor_options is not None: + ret.append(self.tensor_options) + ret.extend(self.post_tensor_options_kwarg_only) + return ret + + @property + def all(self) -> Sequence[Argument | SelfArgument | TensorOptionsArguments]: + ret: list[Argument | SelfArgument | TensorOptionsArguments] = [] + ret.extend(self.positional) + ret.extend(self.kwarg_only) + 
ret.extend(self.out) + return ret + + def mutable_arg_names(self) -> list[str]: + return [ + a.name + for a in self.flat_all + if a.annotation is not None and a.annotation.is_write + ] + + def has_tensor_arg(self) -> bool: + return any(a.type.is_tensor_like() for a in self.flat_non_out) + + def has_symint_arg(self) -> bool: + return any(a.type.is_symint_like() for a in self.flat_non_out) + + def has_generator_arg(self) -> bool: + return any(a.type.is_generator_like() for a in self.flat_non_out) + + def signature(self, *, strip_default: bool = False) -> Arguments: + # dataclasses.replace could be used here, but it is less + # type safe so for now I've opted to type everything out + def strip_arg_annotation(a: Argument) -> Argument: + return Argument( + name=a.name, + type=a.type, + default=a.default if not strip_default else None, + annotation=None, + ) + + return Arguments( + pre_self_positional=tuple( + map(strip_arg_annotation, self.pre_self_positional) + ), + self_arg=SelfArgument(strip_arg_annotation(self.self_arg.argument)) + if self.self_arg is not None + else None, + post_self_positional=tuple( + map(strip_arg_annotation, self.post_self_positional) + ), + # Since TensorOptions are dropped, the post_tensor_options_kwargs are + # converted to pre_tensor_options_kwargs + pre_tensor_options_kwarg_only=tuple( + map(strip_arg_annotation, self.pre_tensor_options_kwarg_only) + ) + + tuple(map(strip_arg_annotation, self.post_tensor_options_kwarg_only)), + # TensorOptions are dropped in signature, + # so we can pair factory functions with their out= variants. + tensor_options=None, + post_tensor_options_kwarg_only=(), + # out arguments are dropped in signature + out=(), + ) + + def remove_self_annotation(self) -> Arguments: + assert self.self_arg is not None + return dataclasses.replace( + self, + self_arg=SelfArgument( + dataclasses.replace(self.self_arg.argument, annotation=None) + ), + ) + + def with_out_args(self, outs: list[Argument]) -> Arguments: + assert len(self.out) == 0 + return dataclasses.replace( + self, + out=tuple(outs), + ) + + @staticmethod + def _preparse(args: str) -> tuple[list[Argument], list[Argument], list[Argument]]: + positional: list[Argument] = [] + kwarg_only: list[Argument] = [] + out: list[Argument] = [] + arguments_acc = positional + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg in args.split(", "): + if not arg: + continue + if arg == "*": + assert ( + arguments_acc is positional + ), "invalid syntax: kwarg-only specifier * can only occur once" + arguments_acc = kwarg_only + continue + parg = Argument.parse(arg) + # Currently, we rely directly on the invariant that there are NO + # kwarg-only mutating arguments. If you want to relax this, + # we will need a more semantic way of matching that takes + # into account return arguments. In that case, you will have + # to manage out computation a level up, in FunctionSchema. See Note + # [is_out_fn] + if parg.annotation is not None and parg.annotation.is_write: + if arguments_acc is positional: + pass # do nothing + elif arguments_acc is kwarg_only: + arguments_acc = out + else: + assert arguments_acc is not out + arguments_acc.append(parg) + + return positional, kwarg_only, out + + @staticmethod + def parse(args: str) -> Arguments: + """ + Input: 'int x, int y, int z' + """ + + # We do this in two phases. First we parse into three + # main categories: positional, kwarg_only, out. 
+ # Then, we reparse positional and kwarg_only to separate + # out the self argument and tensor options arguments. + + positional, kwarg_only, out = Arguments._preparse(args) + + # Split self argument + self_ix = None + for i, a in enumerate(positional): + if a.name == "self": + self_ix = i + break + pre_self_positional: list[Argument] + self_arg: SelfArgument | None + post_self_positional: list[Argument] + if self_ix is not None: + pre_self_positional = positional[:self_ix] + self_arg = SelfArgument(positional[self_ix]) + post_self_positional = positional[self_ix + 1 :] + else: + pre_self_positional = [] + self_arg = None + post_self_positional = positional + + # Group tensor options arguments + pre_tensor_options_kwarg_only: list[Argument] = [] + tensor_options: TensorOptionsArguments | None = None + post_tensor_options_kwarg_only: list[Argument] = [] + kwarg_only_acc = pre_tensor_options_kwarg_only + + def pred(name: str, ty: Type) -> Callable[[Argument], bool]: + return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] + + predicates = [ # order matters + pred("dtype", Type.parse("ScalarType")), + pred("layout", Type.parse("Layout")), + pred("device", Type.parse("Device")), + pred("pin_memory", Type.parse("bool")), + ] + + i = 0 + while i < len(kwarg_only): + # If there is enough space... + if i <= len(kwarg_only) - len(predicates): + # And the next len(predicates) arguments look like TensorOptions arguments + if all( + p(a) + for p, a in zip(predicates, kwarg_only[i : i + len(predicates)]) + ): + assert kwarg_only_acc is pre_tensor_options_kwarg_only + # Group them together as one argument + tensor_options = TensorOptionsArguments( + dtype=kwarg_only[i], + layout=kwarg_only[i + 1], + device=kwarg_only[i + 2], + pin_memory=kwarg_only[i + 3], + ) + i += len(predicates) + kwarg_only_acc = post_tensor_options_kwarg_only + continue + kwarg_only_acc.append(kwarg_only[i]) + i += 1 + + return Arguments( + pre_self_positional=tuple(pre_self_positional), + self_arg=self_arg, + post_self_positional=tuple(post_self_positional), + pre_tensor_options_kwarg_only=tuple(pre_tensor_options_kwarg_only), + tensor_options=tensor_options, + post_tensor_options_kwarg_only=tuple(post_tensor_options_kwarg_only), + out=tuple(out), + ) + + def __str__(self) -> str: + all_arguments: list[str] = [] + all_arguments.extend(map(str, self.flat_positional)) + if self.flat_kwarg_only or self.out: + all_arguments.append("*") + all_arguments.extend(map(str, self.flat_kwarg_only)) + all_arguments.extend(map(str, self.out)) + return ", ".join(all_arguments) + + def __post_init__(self) -> None: + # TODO: These invariants are weirdly asymmetric? + # TODO: Fancier types? + if self.self_arg is None: + assert not self.pre_self_positional + if self.tensor_options is None: + assert not self.post_tensor_options_kwarg_only + + # We don't allow any of the following to have argument annotations, + # to keep things simple. + mutable_pre_self_positionals = [ + a + for a in self.pre_self_positional + if a.annotation is not None and a.annotation.is_write + ] + assert ( + len(mutable_pre_self_positionals) == 0 + ), "mutable pre_self_positional arguments are not currently supported in the schema" + + +# Names that validly are __iXXX__ indicating inplace operations. 
+# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods +# NB: PyTorch hasn't actually implemented all of these +AUGMENTED_ASSIGNMENT_NAMES = [ + "add", + "sub", + "mul", + "div", + "mod", + "pow", + "lshift", + "rshift", + "and", + "xor", + "or", +] + + +# A BaseOperatorName is what we think of the operator name, without +# the overload name. Unusually, we don't represent this as just a +# string; instead, we directly represent a few important semantic +# bits of information we derive from the string: namely whether +# or not it's inplace (add_) and whether or not it's a double-underscore +# method (__add__) +@dataclass(frozen=True) +class BaseOperatorName: + base: str + inplace: bool + dunder_method: bool + # Note [Overload Ambiguity With Functional Variants] + # A handful of operators have both a "mutable" and a "functional" variant. + # (native_batch_norm is a good example, although this isn't the case today). + # For those operators, the mutable and functional variant take in the same set of + # arguments, but have different alias annotations. + # this makes it ambiguous when you try to resolve an OverloadPacket into an overload, + # given a set of input arguments. + # + # So instead of making the "functional" variant in this case a real overload, e.g: + # native_batch_norm (mutable variant) + # native_batch_norm.functional (functional variant) + # we make it a new base operator, + # native_batch_norm_functional (functional variant) + # + # In an ideal world, we would probably invert this so the operators were: + # native_batch_norm.mutable (mutable variant) + # native_batch_norm (functional variant) + # + # Doing that is BC-breaking though, so we're stuck with the above modeling. + functional_overload: bool = False + + @staticmethod + def parse(op: str) -> BaseOperatorName: + assert op != "" + assert not op.endswith("_out"), ( + "_out suffix is reserved and not permitted for operator names; " + "did you mean to specify an out overload name instead?" + ) + m = re.match(r"^__([^_]+)__$", op) + if m is not None: + dunder_method = True + base = m.group(1) + if any(base == f"i{n}" for n in AUGMENTED_ASSIGNMENT_NAMES): + inplace = True + base = base[1:] + else: + inplace = False + # temporary, this is not intrinsically true but + # has been historically true for dunder methods + # we support (but, if we ever got, say, __int__, this would + # be wrong!) + assert base[0] != "i" + else: + dunder_method = False + base = op + if base[-1] == "_": + inplace = True + base = base[:-1] + else: + inplace = False + + # See Note [Overload Ambiguity With Functional Variants] + functional_suffix = "_functional" + if base.endswith(functional_suffix): + functional_overload = True + base = base[: -len(functional_suffix)] + # This seems complicated and unnecessary, so banning dunder methods + # for now on ops that have a functional + mutable variant (like native_batch_norm). + assert not dunder_method and not inplace + else: + functional_overload = False + + r = BaseOperatorName( + base=base, + inplace=inplace, + dunder_method=dunder_method, + functional_overload=functional_overload, + ) + assert str(r) == op, f"{str(r)} != {op}" + return r + + def __str__(self) -> str: + if self.dunder_method: + i = "i" if self.inplace else "" + return f"__{i}{self.base}__" + else: + i = ( + "_" + if self.inplace + else "_functional" + if self.functional_overload + else "" + ) + return f"{self.base}{i}" + + +# Operator name is the base operator name along with the (typically not +# user visible) overload string. 
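+# A rough sketch of the expected parse behavior (see parse() below):
+#   OperatorName.parse("add.Tensor")  -> base "add", overload_name "Tensor"
+#   OperatorName.parse("add_.Scalar") -> base "add" (inplace), overload_name "Scalar"
+#   OperatorName.parse("relu")        -> base "relu", empty overload_name
+# and str() round-trips each of these back to the original string.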
+@dataclass(frozen=True) +class OperatorName: + name: BaseOperatorName + overload_name: str + + @staticmethod + def parse(op_name: str) -> OperatorName: + if "." in op_name: + name, overload_name = op_name.split(".", 1) + else: + name = op_name + overload_name = "" + r = OperatorName(name=BaseOperatorName.parse(name), overload_name=overload_name) + assert str(r) == op_name, f"{str(r)} != {op_name}" + return r + + def __str__(self) -> str: + if self.overload_name: + return f"{self.name}.{self.overload_name}" + else: + return f"{self.name}" + + # NB: This must be synchronized with the naming scheme in + # aten/src/ATen/templates/Operators.h + # Given a function schema "aten::op.overload(...)", + # If there is no overload name, this returns f"{op}" + # If there is an overload name, this returns f"{op}_{overload}" + def unambiguous_name(self) -> str: + if self.overload_name: + return f"{self.name}_{self.overload_name}" + else: + return f"{self.name}" + + def remove_inplace(self) -> OperatorName: + return OperatorName( + name=BaseOperatorName( + base=self.name.base, + inplace=False, + dunder_method=self.name.dunder_method, + ), + overload_name=self.overload_name, + ) + + def with_overload(self, overload: str) -> OperatorName: + return OperatorName( + name=BaseOperatorName( + base=self.name.base, + inplace=False, + dunder_method=self.name.dunder_method, + ), + overload_name=overload, + ) + + +def gets_generated_out_inplace_wrapper( + f: NativeFunction, g: NativeFunctionsGroup, b: BackendIndex +) -> bool: + return ( + f.func.kind() is not SchemaKind.functional + and not b.has_kernel(f) + and b.has_kernel(g.functional) + ) + + +# NativeFunction objects that are views (f.is_view_op returns True) +# are added into a `NativeFunctionsViewGroup`, which we can use to +# easily access the generated (optional) view_copy NativeFunction. +# It's convenient to group them together, so we pair them up in NativeFunctionsViewGroup. +# See Note [Codegen'd {view}_copy Operators] +# +# One property of this representation is that in order for a view-like op to be part of +# a NativeFunctionsViewGroup, the "aliasing" version of that view op must exist. +# There's one case where that doesn't happen: we have a non-aliasing `narrow_copy.out` op, +# but don't have corresponding aliasing `narrow.out` op. +# This means that `narrow_copy.out` won't appear as a NativeFunctionsViewGroup. +@dataclass(frozen=True) +class NativeFunctionsViewGroup: + view: NativeFunction + # Note: the {view}_copy operator is optional because we currently don't generate copy variants + # for all view ops. Notably, we don't generate them for CompositeImplicitAutograd views + # (we already get them "for free" through decomposition) + view_copy: NativeFunction | None + # view_inplace ops are also optional, but every view_inplace op should have out-of-place variant. + view_inplace: NativeFunction | None + + def __post_init__(self) -> None: + assert self.view.is_view_op + if self.view_copy is None: + assert not gets_generated_view_copy(self.view), ( + f"{str(self.view.func.name)} appears to be a new operator that aliases its inputs." + " The codegen expects you to add a corresponding operator to native_functions.yaml:" + f" {get_view_copy_name(self.view)!s}." + " See Note [view_copy NativeFunctions] for details." 
+ ) + else: + assert self.view_copy.func.name.name.base.endswith(("_copy", "_scatter")) + assert self.view.func.signature() == self.view_copy.func.signature( + strip_view_copy_name=True, + ) + assert "view_copy" in self.view_copy.tags, ( + f"{str(self.view_copy.func.name), str(self.view.tags)} appears to be a view_copy operator. The codegen expects" + " view_copy operators to be annotated with the 'view_copy' tag in native_functions.yaml." + " See Note [view_copy NativeFunction] for details." + ) + if self.view_inplace is not None: + assert self.view.func.signature() == self.view_inplace.func.signature() + + if self.view.has_composite_implicit_autograd_kernel: + if self.view_inplace is not None: + assert self.view_inplace.has_composite_implicit_autograd_kernel, ( + f"{str(self.view.func.name)} and {str(self.view_inplace.func.name)} must either" + " both have CompositeImplicitAutograd kernels, or both not have composite kernels." + ) + if self.view.has_composite_implicit_autograd_nested_tensor_kernel: + if self.view_inplace is not None: + assert ( + self.view_inplace.has_composite_implicit_autograd_nested_tensor_kernel + ), ( + f"{str(self.view.func.name)} and {str(self.view_inplace.func.name)} must either" + " both have CompositeImplicitAutogradNestedTensor kernels, or both not have composite kernels." + ) + + def functions(self, *, include_copy: bool = True) -> Iterator[NativeFunction]: + yield self.view + if self.view_inplace is not None: + yield self.view_inplace + if self.view_copy is not None and include_copy: + yield self.view_copy + + @property + def root_name(self) -> str: + return self.view.root_name + + @property + def composite(self) -> bool: + # We currently assert that the "group" is consistent. + # If the view op is composite, then its view_inplace op is too. + return self.view.has_composite_implicit_autograd_kernel + + +def gets_generated_view_copy(f: NativeFunction) -> bool: + # Only aliasing (view) operators get a copy variant. + if not f.is_view_op: + return False + # We don't need to bother generating copy variants for CompositeImplicitAutograd ops, + # because we can let them decompose into base view ops. + if f.has_composite_implicit_autograd_kernel: + return False + # We also don't need to generate copy variants for inplace views. + if "inplace_view" in f.tags: + return False + # Assume ops ending in _inverse have manually-defined copy variants + # (e.g. slice_inverse() has the copy variant slice_scatter()). + # We -could- probably generate these as well, but the codegen will be + # slightly different, and hand-writing these few kernels keeps codegen + # complexity lower. + if f.func.name.name.base.endswith("_inverse"): + return False + return True + + +# Given a NativeFunction that corresponds to a view op, +# returns the OperatorName of the corresponding "copy" variant of the op. +def get_view_copy_name(f: NativeFunction) -> OperatorName: + # Right now, when asking for a view op's corresponding "view_copy" name + # we assert for sanity that the op is allowed to have a generated view_copy variant. + # (We can do this because "gets_generated_view_copy()" tell us which ops get a generated view_copy op). + # However, narrow_copy() already exists as an op directly in native_functions.yaml. + # I'm hardcoding narrow_copy here for now to maintain the assert, + # But we could also just get rid of the assert. 
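+    # Rough examples of the mapping this produces (assuming typical view ops):
+    #   slice.Tensor -> slice_copy.Tensor
+    #   view         -> view_copy
+    # i.e. "_copy" is appended to the base name and the overload name is preserved.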
+ list_of_ops_with_explicit_view_copy_operators = ["narrow"] + if str(f.func.name) not in list_of_ops_with_explicit_view_copy_operators: + assert gets_generated_view_copy(f) + + base_name = f"{f.func.name.name.base}_copy" + view_copy_name = OperatorName( + name=BaseOperatorName( + base=base_name, inplace=False, dunder_method=f.func.name.name.dunder_method + ), + overload_name=f.func.name.overload_name, + ) + return view_copy_name + + +# Helper functions for parsing argument lists (both inputs and returns) + + +def parse_returns(return_decl: str) -> tuple[Return, ...]: + """ + Input: '()' + Output: [] + """ + if return_decl == "()": + return () + if return_decl[0] == "(" and return_decl[-1] == ")": + return_decl = return_decl[1:-1] + return tuple(Return.parse(arg) for arg in return_decl.split(", ")) + + +# A Precompute instance consists of a map from kernel argument name +# to the list of Argument instances that should replace that +# kernel argument in the impl function. +@dataclass(frozen=True) +class Precompute: + # A map from kernel argument name -> a list of precomputed + # elements that replaces/supersedes it. + replace: dict[str, list[Argument]] + # List of precomputed args added without replacement + add: list[Argument] + + @staticmethod + def parse(src: object) -> Precompute: + assert isinstance(src, list) + + # src is a list of strings of the format: + # {kernel param name} -> {replacement decl}[, {replacement decl}, ...] + # [{add decl}[, {add decl}, ...]] + # The last line is optional and contains the precomputed parameters that are + # added without replacement. + # The other lines are parsed to get the names of which precomputed elements + # should replace which kernel arguments. + add_args = [] + if " -> " not in src[-1]: + add_list = src[-1].split(",") + add_args = [Argument.parse(name.strip()) for name in add_list] + src = src[:-1] + + replace = {} + for raw_replace_item in src: + assert isinstance(raw_replace_item, str) + assert " -> " in raw_replace_item, ( + "precomputed parameters without replacement" + " are allowed only in the last line" + ) + + arg, with_list_raw = raw_replace_item.split(" -> ") + assert ( + " " not in arg + ), f"illegal kernel param name '{arg}' in precomputed parameters'" + with_list = with_list_raw.split(",") + with_list_args = [Argument.parse(name.strip()) for name in with_list] + replace[arg] = with_list_args + + r = Precompute(replace=replace, add=add_args) + assert r.to_list() == src, "r.to_list() != src" + return r + + def __post_init__(self) -> None: + # the template parameters are upper so if these are the + # same then it is ambiguous + for a in self.add: + assert a.name.upper() != a.name + for args in self.replace.values(): + for a in args: + assert a.name.upper() != a.name + + def to_list(self) -> list[str]: + replace_list = [] + for kernel_param, replacement_params in self.replace.items(): + replacements = ", ".join(str(param) for param in replacement_params) + replace_list.append(f"{kernel_param} -> {replacements}") + + return replace_list diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py new file mode 100644 index 00000000000..a44efab6842 --- /dev/null +++ b/torchgen/native_function_generation.py @@ -0,0 +1,646 @@ +from __future__ import annotations + +from collections import defaultdict +from typing import Sequence + +import torchgen.api.dispatcher as dispatcher +from torchgen.api.translate import translate +from torchgen.api.types import Binding, DispatcherSignature, Expr +from 
torchgen.context import with_native_function +from torchgen.model import ( + Annotation, + Argument, + BackendIndex, + BackendMetadata, + BaseOperatorName, + BaseTy, + BaseType, + DEFAULT_KERNEL_NAMESPACE, + DeviceCheckType, + DispatchKey, + FunctionSchema, + NativeFunction, + NativeFunctionsGroup, + OperatorName, + Return, + SchemaKind, + Variant, +) +from torchgen.utils import concatMap + + +# See Note: [Out ops with functional variants that don't get grouped properly] +OUT_OPS_THAT_DONT_GET_GROUPED_PROPERLY = [ + # This has a functional variant, but it's currently marked private. + # This function should be marked private as well (*_backward ops aren't exposed to python anyway). + "adaptive_avg_pool3d_backward.grad_input", + # There's a functional variant, _slow_conv2d_backward.output_mask, that isn't grouped properly. + # Maybe we can kill this operator in favor of convolution_backward? + "_slow_conv2d_backward.grad_input", +] + + +# See Note: [Mutable ops that cannot get an out variant] +MUTABLE_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT = [ + # should be out=? + "_cummax_helper", + # should be out=? + "_cummin_helper", +] + +# All of these operators don't have any tensor like returns +FUNCTIONAL_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT = [ + "_assert_async", # no return + "_assert_async.msg", # no return + "_cslt_sparse_mm_search", # returns an int + "_assert_scalar", # no return + "_dimI", # returns an int + "_dimV", # returns an int + "_has_same_storage_numel", # returns a boolean + "_linalg_check_errors", # no return + "_local_scalar_dense", # returns a Scalar + "_nested_tensor_from_mask_left_aligned", # returns a boolean + "_nnz", # returns an int + "_use_cudnn_ctc_loss", # returns a boolean + "_use_cudnn_ctc_loss.Tensor", # returns a boolean + "_validate_compressed_sparse_indices", # no return + "allclose", # returns a boolean + "dense_dim", # returns an int + "equal", # returns a boolean + "is_coalesced", # returns an boolean + "is_pinned", # returns a boolean + "is_same_size", # returns a boolean + "is_set_to", # returns a boolean + "q_per_channel_axis", # returns an int + "q_scale", # returns a float + "q_zero_point", # returns an int + "qscheme", # returns a QScheme + "record_stream", # no return + "sparse_dim", # returns an int + "sym_constrain_range", # no return + "sym_constrain_range_for_size", # no return + "_nested_tensor_storage_offsets", # returns a vector of ints + "_chunk_grad_outputs_efficient_attention", # returns a bool + "_fused_sdp_choice", # returns an int + "_print", # no return + "_sink_tokens", # no return + "_nested_get_ragged_idx", # returns an int +] + +INPLACE_OPS_THAT_DONT_GET_GROUPED_PROPERLY = [ + # polygamma and polygamma.out both exist, but have a + # pre-self arg (while polygamma_ does not) + # We should either fix this schema so it can be grouped properly, + # or allow the codegen to generate new functional/out= NativeFunctions for this op + # (which would require changing its overload name to prevent overload ambiguity). + "polygamma_" +] + + +# Groups "similar" NativeFunctions together +# example add.Tensor, add_.Tensor, add.out +# "similar" NativeFunctions are all expected to have an identical `signature()`, +# But have differing SchemaKinds. 
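+# A sketch of the resulting structure for that add example: add.Tensor,
+# add_.Tensor and add.out all share one signature() key, and the inner dict
+# maps {SchemaKind.functional: add.Tensor, SchemaKind.inplace: add_.Tensor,
+# SchemaKind.out: add.out} (assuming all three variants appear in the yaml).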
+def pre_group_native_functions( + native_functions: Sequence[NativeFunction], +) -> dict[FunctionSchema, dict[SchemaKind, NativeFunction]]: + pre_grouped_native_functions: dict[ + FunctionSchema, dict[SchemaKind, NativeFunction] + ] = defaultdict(dict) + for f in native_functions: + d = pre_grouped_native_functions[f.func.signature()] + assert f.func.kind() not in d + d[f.func.kind()] = f + return pre_grouped_native_functions + + +# Returns the out variant overload name given a base function overload name +def get_expected_out_variant_overload_name(overload_name: str | None) -> str: + return "out" if not overload_name else f"{overload_name}_out" + + +# Helper function: given an inplace FunctionSchema, generate its corresponding out= variant +# Example before: +# _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) +# Example after: +# _add_relu.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) +def self_to_out_signature(func: FunctionSchema) -> FunctionSchema: + # Generating an out= schema from an inplace schema. + assert func.kind() == SchemaKind.inplace + assert func.arguments.self_arg is not None + # The new out= schema has: + # - a new out argument with the same type as "func" (but with a mutable annotation) + # - The returns (if any) now alias the out= argument instead of "func" + # - an "out" overload name + return FunctionSchema( + name=func.name.remove_inplace().with_overload( + get_expected_out_variant_overload_name(func.name.overload_name) + ), + arguments=func.arguments.remove_self_annotation().with_out_args( + [ + Argument( + name="out", + type=func.arguments.self_arg.argument.type, + default=None, + annotation=func.arguments.self_arg.argument.annotation, + ) + ] + ), + returns=func.returns, + ) + + +# Helper function: given a functional FunctionSchema, generate its corresponding out= variant +# Example before: +# _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, +# bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor +# Example after: +# _to_copy._out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, +# Tensor(a!) out) -> Tensor(a!) +def functional_to_out_signature(func: FunctionSchema) -> FunctionSchema: + # Generating an out= schema from a functional schema. + assert func.kind() == SchemaKind.functional + + new_returns, new_out_args = generate_out_args_from_schema(func) + # The new out= schema has: + # - one or more new out argument(s) with the same type as returns (but with a mutable annotation) + # - The returns now alias the out= arguments + # - an "_out" overload name + return FunctionSchema( + name=func.name.with_overload( + get_expected_out_variant_overload_name(func.name.overload_name) + ), + arguments=func.arguments.signature().with_out_args( + new_out_args, + ), + returns=tuple(new_returns), + ) + + +# Helper function: given a function schema, generate corresponding out arguments, also the updated return annotations. +def generate_out_args_from_schema( + func: FunctionSchema, +) -> tuple[list[Return], list[Argument]]: + # More of a sanity check - our existing restrictions on schemas should enforce that + # mutable schema kinds never return their mutable arguments. 
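+    # Sketch of the expected output for a purely functional schema with two
+    # plain Tensor returns and no existing annotations: two out arguments
+    # "out0" and "out1" annotated (a!) and (b!) are produced, and the returns
+    # become Tensor(a!) and Tensor(b!) so they alias the new out arguments
+    # (the _fused_moving_avg_obs_fq_helper example further down gets (e!)/(f!)
+    # instead, because a-d are already taken by its mutable inputs).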
+ assert not any( + r.annotation is not None and r.annotation.is_write for r in func.returns + ) + + tensorlike_rets = [r for r in func.returns if r.type.is_tensor_like()] + assert len(tensorlike_rets) > 0 + + used_annotations = concatMap( + lambda a: [] if a.annotation is None else a.annotation.alias_set, + func.arguments.flat_all, + ) + valid_annotations = [ + x for x in "abcdefghijklmnopqrstuvwxyz" if x not in used_annotations + ] + + all_rets_are_tensors = all(r.type == BaseType(BaseTy.Tensor) for r in func.returns) + + new_out_args: list[Argument] = [] + # The end result of new_returns is that: + # - If every return is a plain tensor, then the new returns == the old returns, but with the out= alias annotations added. + # - Otherwise, none of the out arguments show up in the returns (and we're only left with non-tensor-like returns, if any). + new_returns: list[Return] = [] + for i, r in enumerate(func.returns): + if r.type.is_tensor_like(): + new_out = Argument( + name="out" if len(func.returns) == 1 else f"out{i}", + type=r.type, + default=None, + annotation=Annotation.parse(f"{valid_annotations[i]}!"), + ) + new_out_args.append(new_out) + if all_rets_are_tensors: + # The convention for out= schemas is that they only return their out arguments + # if the return is a plain Tensor (or if it's a tuple of plain Tensors) + new_ret = Return( + name=None, type=new_out.type, annotation=new_out.annotation + ) + new_returns.append(new_ret) + else: + new_returns.append(r) + return new_returns, new_out_args + + +# Helper function: given a mutable FunctionSchema, generate its corresponding out= variant +# Example before: +# _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) # noqa: B950 +# Example after: +# _fused_moving_avg_obs_fq_helper._out(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False, *, Tensor(e!) out0, Tensor(f!) out1) -> (Tensor(e!), Tensor(f!)) # noqa: B950 +def mutable_to_out_signature(func: FunctionSchema) -> FunctionSchema: + # Generating an out= schema from a mutable schema. + assert func.kind() == SchemaKind.mutable + # The new out= schema has: + # - Any non-aliased tensor-like returns are converted to mutable, aliased out= arguments + # (if the argument is a tensor then we also return it for method chaining, + # otherwise we return nothing) + # - an "out" overload name + # + # Note that: + # (1) This also means that we can *only* generate an out= variant from a mutable schema + # if the mutable schema has at least one tensor-like non-aliasing return. 
+ # (2) The generated out= variant still has mutable positional arguments, + # but if necessary we could probably add another out= variant that also + # functionalizes the mutable arguments (a functional_out variant) + + new_returns, new_out_args = generate_out_args_from_schema(func) + + return FunctionSchema( + name=func.name.remove_inplace().with_overload( + get_expected_out_variant_overload_name(func.name.overload_name) + ), + arguments=func.arguments.with_out_args(new_out_args), + returns=tuple(new_returns), + ) + + +# This function, given function of one SchemaKind, as well as a target SchemaKind, +# generates a new NativeFunction with the same properties, but using the target SchemaKind. +# We only actually generate functions for either functional or out= SchemaKinds. +# This function returns a tuple, with: +# - The generated NativeFunction +# - a dictionary of `BackendIndex` objects, describing which dispatch keys +# we will generate kernels for, for the new NativeFunction. +# Details are in the function, but we only generate composite kernels (in some cases) today. +def generate_function( + f: NativeFunction, k: SchemaKind +) -> tuple[NativeFunction, dict[DispatchKey, dict[OperatorName, BackendMetadata]]]: + from torchgen.api import cpp + + if k == SchemaKind.functional: + assert f.func.kind() != SchemaKind.functional + # The new "functional" NativeFunction has: + # - any mutable arguments have been converted into (immutable) returns. + # (if a mutable argument was not also a return, it gets converted to one) + # - "_functional" appended to the base name, ONLY IF this op has a mutable variant. + # See Note [Overload Ambiguity With Functional Variants] + # The default grouping logic in signature() actually already does this, + # so we can piggy-back off it (but we still want return names) + func = f.func.signature(keep_return_names=True).with_name( + OperatorName( + name=BaseOperatorName( + base=f.func.name.name.base, + inplace=False, + dunder_method=f.func.name.name.dunder_method, + # See Note [Overload Ambiguity With Functional Variants] + functional_overload=f.func.kind() == SchemaKind.mutable, + ), + overload_name=f.func.name.overload_name, + ) + ) + elif k == SchemaKind.out: + # We generate out= ops mostly just so that we can pair up NativeFunctions into groups easily, + # but at least today, there is no good reason to actually use them. + # we'll generate a dispatcher entry for them, but won't actually register any kernels for them. + if f.func.kind() == SchemaKind.inplace: + func = self_to_out_signature(f.func) + elif f.func.kind() == SchemaKind.mutable: + func = mutable_to_out_signature(f.func) + elif f.func.kind() == SchemaKind.functional: + func = functional_to_out_signature(f.func) + else: + raise AssertionError( + "We only bother generating out= functions from either inplace or mutable or functional variants" + ) + else: + raise AssertionError( + "We currently only generate either functional or out= NativeFunctions" + ) + + # Generated kernel naming convention for out: _. The reason for this is to + # disambiguate operator with the same name but different overload name, e.g., `randn.names_out` and + # `randn.generator_with_names_out`. 
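+    # Rough examples of the resulting kernel names (a sketch, not exhaustive):
+    #   a generated out= op "randn.generator_with_names_out" gets the kernel
+    #   name "randn_generator_with_names_out", while a generated functional op
+    #   keeps the plain cpp name; "_symint" is appended in either case when the
+    #   schema uses SymInt.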
+ kernel_name = ( + func.name.unambiguous_name() + if func.kind() == SchemaKind.out + else cpp.name(func) + ) + if f.func.has_symint(): + kernel_name += "_symint" + backend_metadata = { + DispatchKey.CompositeExplicitAutograd: { + func.name: BackendMetadata( + kernel=kernel_name, + structured=False, + cpp_namespace=DEFAULT_KERNEL_NAMESPACE, + ) + } + } + tags = {"generated"} | set( + f.tags & {"nondeterministic_seeded", "view_copy", "pt2_compliant_tag"} + ) + + return ( + NativeFunction( + func=func, + use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors, + # These generated fn's aren't meant to be user friendly- don't generate methods. + variants={Variant.function}, + structured=False, + structured_delegate=None, + structured_inherits=None, + precomputed=None, + autogen=[], + ufunc_inner_loop={}, + manual_kernel_registration=False, + manual_cpp_binding=False, + python_module=None, + category_override=None, + device_guard=False, + device_check=DeviceCheckType.NoCheck, + loc=f.loc, + cpp_no_default_args=set(), + is_abstract=f.is_abstract, + has_composite_implicit_autograd_kernel=False, + has_composite_implicit_autograd_nested_tensor_kernel=False, + has_composite_explicit_autograd_kernel=True, + has_composite_explicit_autograd_non_functional_kernel=False, + # Every generated NativeFunction gets a "generated" tag, so it's easy to tell + # which NativeFunction objects did not come directly from native_functions.yaml. + tags=tags, + namespace=f.namespace, + ), + backend_metadata, + ) + + +# This function is responsible for adding generated NativeFunctions which don't appear +# explicitly in the codegen. +# You can inspect the full list of NativeFunctions yourself with the torchgen package, by running +# torchgen.parse_native_yaml("aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml") +# (Maybe we should make a friendly API for this) +# +# Note: this function *mutates* its two inputs, +# adding the new NativeFunctions / BackendMetadata to them +def add_generated_native_functions( + rs: list[NativeFunction], + indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]], +) -> None: + # The main code for generating new NativeFunctions + # First we group of NativeFunctions by schema kind, + # then we detect which ones are missing and generate them. + pre_grouped_native_functions = pre_group_native_functions(rs) + for d in pre_grouped_native_functions.values(): + has_functional = SchemaKind.functional in d + has_inplace = SchemaKind.inplace in d + has_mutable = SchemaKind.mutable in d + has_out = SchemaKind.out in d + + # We automatically generate a few native functions that don't exist in the yaml, for a few reasons: + # (1) If an operator has an inplace/out= variant but no functional variant, we can generate + # a simple functional variant that the functionalization pass can consume. + # (2) If an operator has an inplace or functional but no out= variant, we generate an out= + # variant, mostly so we can easily pair up functions into NativeFunctionsGroup, + # while maintaining the constraint that the out= variant is "required". + if has_mutable or has_inplace or has_out or has_functional: + # Don't bother generating functions trio's for native functions that bypass the dispatcher. 
+ are_manual = all(f.manual_cpp_binding for f in d.values()) + # Don't bother generating functional + out= variants for view operators + # set_ is technically an inplace_view, but for now it is treated + # as a normal inplace op in the codegen + has_view_ops = any( + f.is_view_op and str(f.func.name.name) != "set_" for f in d.values() + ) + # Don't generate the other variants for CompositeImplicitAutograd operators. + # We could probably do this, but the main benefit of generating the function triplets + # is for transforms that need them, and transforms don't need to act directly + # on CompositeImplicitAutograd operators (since we let them decompose). + are_composite_implicit = all( + f.has_composite_implicit_autograd_kernel for f in d.values() + ) + if are_manual or has_view_ops or are_composite_implicit: + continue + if has_out and len(d.values()) == 1: + # Note: [Out ops with functional variants that don't get grouped properly] + # In theory we could validly have an out= operator in native_functions.yaml + # that has no other variants. + # But today, all of the operators where that's the case actually do have + # functional variants, that we are just unable to pair up properly. + # I think banning this all together is probably safer + # (you can always add a functional variant yourself if you want to add a new out= operator). + # + # We should probably fix the existing cases; this check is to prevent us from adding more over time. + if ( + str(d[SchemaKind.out].func.name) + not in OUT_OPS_THAT_DONT_GET_GROUPED_PROPERLY + ): + raise AssertionError( + f"Found an out= operator that we could not find any other variants of: {str(d[SchemaKind.out].func)}" + ) + continue + + # Some inplace ops that have problematic schemas (that we should fix), which prevent us + # from generating out= and functional variants + if ( + has_inplace + and str(d[SchemaKind.inplace].func.name) + in INPLACE_OPS_THAT_DONT_GET_GROUPED_PROPERLY + ): + continue + + base_fn = ( + d[SchemaKind.inplace] + if has_inplace + else d[SchemaKind.mutable] + if has_mutable + else d[SchemaKind.out] + if has_out + else d[SchemaKind.functional] + ) + + # Note: [Mutable ops that cannot get an out variant] + # We can only generate an out= variant if either: + # - the original function has tensor-like returns (since we can convert them to out kwargs) + # - or it's inplace (since we can convert `self` to an out kwarg) + # There are only two functions that don't fit this criteria today though, + # and they both look like they should be fixed to be out= variants, + # so if feels safer to ban this schema all-together + base_fn_valid = base_fn.func.kind() == SchemaKind.inplace or any( + r.type.is_tensor_like() for r in base_fn.func.returns + ) + # Note: [Loosen the assertion that all functional should have out variant] + # By design all functional operators should have our variants. The needs_out check + # is loosening this requirement, changing it to only generate out variant if there's + # an `autogen` block in the native function, in the long run it should be removed. 
+ # FIXME: Remove this after figuring out CI job failures related to min, max, mean + needs_out = any("out" in str(op_name) for op_name in base_fn.autogen) + gets_out_variant = not has_out and base_fn_valid and needs_out + if not has_out and not base_fn_valid: + if ( + str(base_fn.func.name) + not in MUTABLE_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT + and str(base_fn.func.name) + not in FUNCTIONAL_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT + ): + raise AssertionError( + f"""Found an operator that we could not generate an out= variant for: {str(base_fn.func)}. +This type of operators don't have tensor-like return, making it difficult to generate a proper out= variant. If +out= variant is not needed, please add the function name into FUNCTIONAL_OPS_THAT_CANNOT_GET_AN_OUT_VARIANT list.""" + ) + + # Generate an out= variant + if gets_out_variant: + fn, metadata = generate_function(base_fn, SchemaKind.out) + d[SchemaKind.out] = fn + BackendIndex.grow_index(indices, metadata) + rs.append(fn) + + # Generate a functional variant, but only do it if the operator got an out= variant + # (Functional variants are only useful if we can group up the variants, + # which we can only do if they have an out= variant) + if not has_functional and (has_out or gets_out_variant): + fn, metadata = generate_function(base_fn, SchemaKind.functional) + d[SchemaKind.functional] = fn + BackendIndex.grow_index(indices, metadata) + rs.append(fn) + + +def return_str(rets: tuple[Return, ...], names: list[str]) -> str: + assert len(rets) == len(names) + if len(rets) == 0: + return "" + elif len(rets) == 1: + return f"return {names[0]};" + else: + return f"return {dispatcher.returns_type(rets).cpp_type()}({', '.join(names)});" + + +# Given a function, and the name of a variable corresponding to the output of that function, +# gather up all of the individual returns that are not aliased +def gather_nonaliased_inner_rets(func: FunctionSchema, out_var: str) -> list[str]: + aliased_rets = func.aliased_return_names() + non_aliased_names = [] + is_out_var_a_tuple = len(func.returns) > 1 + for i, r in enumerate(aliased_rets): + if r is None: + non_aliased_names.append( + f"std::get<{i}>({out_var})" if is_out_var_a_tuple else out_var + ) + return non_aliased_names + + +# Generates functional kernels in terms of their inplace.mutable counterparts. +# We only do this for "generated" NativeFunctions +@with_native_function +def gen_composite_functional_kernel(g: NativeFunctionsGroup) -> str | None: + # We should only be generating these for code-generated NativeFunctions + if "generated" not in g.functional.tags: + return None + # And we always write the kernel for a generated op in terms of a non-generated op. + if g.inplace is not None and "generated" not in g.inplace.tags: + target_f = g.inplace + elif g.mutable is not None and "generated" not in g.mutable.tags: + target_f = g.mutable + else: + # We should be guaranteed to have a valid inplace/mutable variant to call into. + # See Note: [Mutable Ops Not Using Functionalization] + raise AssertionError(str(g.functional.func)) + + sig = DispatcherSignature(g.functional.func) + target_sig = DispatcherSignature(target_f.func) + + context: list[Binding | Expr] = [] + clone_mutable_inputs = [] + cloned_return_names = [] + # We can't just directly pass all of the arguments from the functional op into the mutating op. + # We need to check for which inputs to the mutating operator are mutable, + # and clone those inputs first. 
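+    # Sketch of the intended effect, using _fused_moving_avg_obs_fq_helper as
+    # an (assumed) example: its mutable running_min/running_max/scale/zero_point
+    # inputs are cloned, the mutable kernel is called on the clones, and the
+    # clones are returned as extra outputs of the generated functional op.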
+ for a_curr, a_tgt in zip( + dispatcher.jit_arguments(g.functional.func), + dispatcher.jit_arguments(target_f.func), + ): + if a_tgt.annotation is not None and a_tgt.annotation.is_write: + clone_mutable_inputs.append( + f"auto {a_curr.name}_clone = clone_arg({a_curr.name});" + ) + context.append( + Expr( + expr=f"{a_curr.name}_clone", + type=dispatcher.argument_type(a_curr, binds=a_curr.name), + ) + ) + # Invariant: mutable arguments on the inner mutable op are always returns on the functional op. + cloned_return_names.append(f"{a_curr.name}_clone") + else: + context.append(dispatcher.argument(a_curr)) + exprs = ", ".join([e.expr for e in translate(context, target_sig.arguments())]) + + out_name = "output" + maybe_assign = f"auto {out_name} = " if len(target_f.func.returns) > 0 else "" + inner_return_names = gather_nonaliased_inner_rets(target_f.func, out_name) + ret_str = return_str( + g.functional.func.returns, inner_return_names + cloned_return_names + ) + + clone_mutable_inputs_str = "\n".join(clone_mutable_inputs) + return f""" +{sig.defn(name=sig.name() + ("_symint" if g.out.func.has_symint() else ""))} {{ + {clone_mutable_inputs_str} + {maybe_assign}at::_ops::{target_f.func.name.unambiguous_name()}::call({exprs}); + {ret_str} +}} +""" + + +# Generates out= kernels in terms of their functional counterparts. +# We only do this for "generated" NativeFunctions +@with_native_function +def gen_composite_out_kernel(g: NativeFunctionsGroup) -> str | None: + # We should only be generating these for code-generated NativeFunctions + if "generated" not in g.out.tags: + return None + # And we always write the kernel for the out= op in terms of the functional. + # Note that the functional op might have also been generated, but we don't have to + # worry about cycles, because the generated functional kernels are always implemented + # in terms of non-generated kernels (see gen_composite_functional_kernel). + + sig = DispatcherSignature(g.out.func) + target_sig = DispatcherSignature(g.functional.func) + + exprs = ", ".join( + [e.expr for e in translate(sig.arguments(), target_sig.arguments())] + ) + + copy_outs = [] + out_name = "tmp_output" + for i, out_arg in enumerate(g.out.func.arguments.out): + functional_return_name = ( + out_name + if len(g.functional.func.returns) == 1 + else f"std::get<{i}>({out_name})" + ) + copy_outs.append( + f"""\ + resize_out_helper({out_arg.name}, {functional_return_name}); + copy_arg({out_arg.name}, {functional_return_name});""" + ) + + rets = [] + # For each return arg in the calling (out=) operator, + # If it corresponds to an aliased input, return the input. + # Otherwise, return the corresponding output from calling the functional operator. 
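+    # e.g. (sketch) for a generated out= op whose returns are all aliased to
+    # out0/out1, this simply returns out0, out1 after the copy_outs above have
+    # copied the functional results into them; any non-aliased return falls
+    # through to the corresponding output of the functional call.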
+ for i, ret_name in enumerate(g.out.func.aliased_return_names()): + if ret_name is not None: + rets.append(ret_name) + else: + functional_return_name = ( + out_name + if len(g.functional.func.returns) == 1 + else f"std::get<{i}>({out_name})" + ) + rets.append(functional_return_name) + + copy_outs_str = "\n".join(copy_outs) + + # Kernel name needs to follow the naming convention defined in `generate_function()` + return f""" +{sig.defn(name=g.out.func.name.unambiguous_name() + ("_symint" if g.out.func.has_symint() else ""))} {{ + auto {out_name} = at::_ops::{g.functional.func.name.unambiguous_name()}::call({exprs}); + {copy_outs_str} + {return_str(g.out.func.returns, rets)} +}} +""" diff --git a/torchgen/native_functions.yaml b/torchgen/native_functions.yaml new file mode 100644 index 00000000000..7970e17eb96 --- /dev/null +++ b/torchgen/native_functions.yaml @@ -0,0 +1,15622 @@ +# See README.md in this directory for more guidance + +# *********NB: _cast_* operators are DEPRECATED and will be removed +# eventually. These were previously used before TorchScript IR supported +# representing ScalarType's. They are now superseded by usage of +# `aten::to()`. The ops remain here for backward compatibility purposes. + +# DEPRECATED. DO NOT USE +- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# Computes the gradient of current tensor w.r.t. graph leaves. +- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> () + manual_cpp_binding: True + variants: method + +# DEPRECATED. Sets the tensor data held by this `Variable` to be the same as +# `new_data`. It requires that `new_data` and `Variable` have compatible tensor +# type, by checking `_has_compatible_shallow_copy_type(this, new_data)`. +# +# This function is deprecated because it doesn't really make sense in a world +# where Variables *are* Tensors (as opposed to them containing tensors, which +# is what the previous interpretation was.) +- func: set_data(Tensor(a!) self, Tensor new_data) -> () + manual_cpp_binding: True + variants: method + +- func: data(Tensor self) -> Tensor + manual_cpp_binding: True + variants: method + +# True if this `Variable` is a leaf and thus does not have a `grad_fn`. +- func: is_leaf(Tensor self) -> bool + manual_cpp_binding: True + variants: method + +# Returns the output index of this variable from the forward operation that +# produced it. Conversely, it returns the input index of the gradient `Node` to +# which this `Variable` is connected (because in the gradient computation, +# inputs and outputs switch meaning). 
For example: +# +# y0, y1, y2 = f(x) +# assert y0.output_nr == 0 +# assert y1.output_nr == 1 +# assert y2.output_nr == 2 +# +- func: output_nr(Tensor self) -> int + manual_cpp_binding: True + variants: method + +- func: _version(Tensor self) -> int + manual_cpp_binding: True + variants: method + +- func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) + manual_cpp_binding: True + variants: method + +# Enables .grad attribute for non-leaf Tensors. +- func: retain_grad(Tensor(a!) self) -> () + manual_cpp_binding: True + variants: method + +- func: retains_grad(Tensor self) -> bool + manual_cpp_binding: True + variants: method + +- func: _fw_primal(Tensor(a) self, int level) -> Tensor(a) + variants: method + dispatch: + CompositeExplicitAutograd: _fw_primal + +- func: _make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a) + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual + +- func: _unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent) + variants: function + +# NOTE: [_new_zeros_with_same_feature_meta] +# This function creates a new tensor with the layout and TensorOptions +# of `other` but also takes into account the batch dimensions of `self` +# +# This function has a couple extra constraints because it is also used for `jvp` +# in functorch. +# - is used for forward AD because there is the restriction +# that the primal and tangent must have the same layout +# - We cannot assume that `self` and `other` have the same sizes or even dim +# because in the inplace over view case, `other` is the base tensor, and +# `self` is the forward grad with respect to the view, which can have an +# entirely different shape +# - takes the number of batch dims for `self` because we also handle +# some batching logic. We handle that here instead of a batching rule because +# we'd like to avoid calling as_strided in the batching rule (as to enable +# nested vmap in functorch). +# - needs to be CompositeExplicitAutograd for jvp support in functorch. +# functorch currently relies on TensorWrapper which does not have storage +# CompositeExplicitAutograd makes sure the TensorWrapper is unwrapped. +# - this function may eventually take on another int argument to store the +# the number of batch dims for other once we support that use case +- func: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _new_zeros_with_same_feature_meta + autogen: _new_zeros_with_same_feature_meta.out + +# This function compares the storage numel of self with that of other, where +# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`. +# We create this function for composite compliance purposes. The batching rule +# always returns true because vmapped as_strided does not support accessing +# storage locations not indexable by the input tensor. +# See the note above for more information. +- func: _has_same_storage_numel(Tensor self, Tensor other) -> bool + variants: function + dispatch: + CompositeExplicitAutograd: _has_same_storage_numel + +- func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) + variants: method + tags: inplace_view + +- func: rename(Tensor(a) self, Dimname[]? 
names) -> Tensor(a) + variants: method + +- func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) + variants: method + +- func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) + variants: method + +- func: align_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: align_tensors(Tensor[] tensors) -> Tensor[] + +# Not assert because it's a keyword; not Assert because FX already +# took that syntax +# TODO: need to specify this is side-effectful somehow +- func: _assert_async(Tensor self) -> () + dispatch: + CPU: _assert_async_cpu + CUDA: _assert_async_cuda + +- func: _assert_async.msg(Tensor self, str assert_msg) -> () + dispatch: + CPU: _assert_async_msg_cpu + CUDA: _assert_async_msg_cuda + +- func: _assert_scalar(Scalar self, str assert_msg) -> () + dispatch: + CompositeExplicitAutograd: _assert_scalar + +- func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor + dispatch: + CompositeExplicitAutograd: _functional_assert_scalar + +- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor + dispatch: + CPU: _functional_assert_async_msg_cpu + +- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> () + +- func: _print(str s) -> () + dispatch: + CompositeExplicitAutograd: _print + +- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> () + dispatch: + CompositeExplicitAutograd: sym_constrain_range + +- func: sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> () + dispatch: + CompositeExplicitAutograd: sym_constrain_range_for_size + +- func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor + dispatch: + CompositeExplicitAutograd: _functional_sym_constrain_range + +- func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor + dispatch: + CompositeExplicitAutograd: _functional_sym_constrain_range_for_size + +- func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + dispatch: + CPU: _make_dep_token_cpu + +- func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) + variants: method + +- func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool + device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss + dispatch: + CUDA: _use_cudnn_ctc_loss + +- func: _use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool + device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss + dispatch: + CUDA: _use_cudnn_ctc_loss_tensor + +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) + device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU + dispatch: + CUDA: _cudnn_ctc_loss + autogen: _cudnn_ctc_loss.out + +- func: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) + device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU + dispatch: + CUDA: _cudnn_ctc_loss_tensor + +- func: _use_cudnn_rnn_flatten_weight() -> bool + +- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor + dispatch: + CUDA: _cudnn_rnn_flatten_weight + autogen: _cudnn_rnn_flatten_weight.out + +- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + # rnn_tanh may or may not redispatch to _cudnn_rnn based on algorithm and build. Thus it might hit dispatch or kernel device check. + # Disable dispatch time device check for consistent behavior. + device_check: NoCheck + dispatch: + CUDA: _cudnn_rnn + autogen: _cudnn_rnn.out + tags: nondeterministic_seeded + +- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + dispatch: + CUDA: _cudnn_rnn_backward + autogen: _cudnn_rnn_backward.out + +- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CUDA: _cudnn_init_dropout_state + autogen: _cudnn_init_dropout_state.out + tags: nondeterministic_seeded + +- func: _debug_has_internal_overlap(Tensor self) -> int + variants: function + +- func: _fused_dropout(Tensor self, float p, Generator? 
generator=None) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: fused_dropout_cuda + tags: nondeterministic_seeded + autogen: _fused_dropout.out + +- func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor + variants: function + dispatch: + CUDA: masked_scale_cuda + autogen: _masked_scale.out + +- func: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: native_dropout_cpu + CUDA: native_dropout_cuda + NestedTensorCPU, NestedTensorCUDA: native_dropout_nested + tags: [nondeterministic_seeded, core] + autogen: native_dropout.out + +- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor + dispatch: + CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward + CUDA: native_dropout_backward_cuda + autogen: native_dropout_backward.out + tags: pointwise + +- func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + +- func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) + +- func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) + +- func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) + +- func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor + +- func: _shape_as_tensor(Tensor self) -> Tensor + +- func: dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: feature_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: alpha_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: abs(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: abs + SparseCPU, SparseCUDA: abs_sparse + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs + tags: [core, pointwise] + +- func: abs_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: abs_ + SparseCPU, SparseCUDA: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_ + +- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: abs_out + MPS: abs_out_mps + SparseCPU, SparseCUDA: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out + tags: pointwise + +# Note [Adding an alias] +# To add an alias do the following: +# +# 1) Copy the original functions native_functions.yaml entry, but replace the +# original function's name with their own and delete any dispatch +# keys for the aliases. 
Specifying a dispatch key will prevent +# autograd from recording the operations the alias performs, which +# will stop it from "inheriting" the original operation's autograd behavior. +# 2) Implement the corresponding functions and have them redispatch to the +# original function. +# 3) Add docstrings to the new function that reference the original function, +# and document the method as usual (if it exists.) +# (See torch/_torch_docs.py and docs/source/torch.rst if adding a function, +# torch/_tensor_docs.py and docs/source/tensors.rst if adding a method, +# or module-specific doc bindings (like torch/linalg/__init__.py) if +# adding an alias in a namespace.) +# 4) Update torch/overrides.py consistent with the original function. +# 5) Update the alias_map in torch/csrc/jit/passes/normalize_ops.cpp. +# 6) Add aliases argument to existing OpInfo/UnaryUfuncInfo or create new OpInfo/UnaryUfuncInfo entry +# in op_db list in torch/testing/_internal/common_methods_invocations.py +# +# See torch.absolute, an alias for torch.abs, as an example. +# Absolute, alias for abs + +- func: absolute(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: absolute_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: angle(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: angle + SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr + tags: pointwise + +- func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: angle_out + SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out + tags: pointwise + +- func: view_as_real(Tensor(a) self) -> Tensor(a) + variants: function + dispatch: + CPU, CUDA, MPS, Meta: view_as_real + +- func: view_as_complex(Tensor(a) self) -> Tensor(a) + variants: function + dispatch: + CPU, CUDA, MPS, Meta: view_as_complex + +- func: sgn(Tensor self) -> Tensor + variants: function, method + structured_delegate: sgn.out + dispatch: + SparseCPU, SparseCUDA: sgn_sparse + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn + tags: pointwise + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + structured_delegate: sgn.out + dispatch: + SparseCPU, SparseCUDA: sgn_sparse_ + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_ + tags: pointwise + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sgn_out + MPS: sgn_out_mps + SparseCPU, SparseCUDA: sgn_sparse_out + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out + tags: pointwise + +- func: chalf(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor + variants: method + +- func: real(Tensor(a) self) -> Tensor(a) + device_check: NoCheck # TensorIterator + variants: function + +- func: imag(Tensor(a) self) -> Tensor(a) + device_check: NoCheck # TensorIterator + variants: function + +- func: _conj(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: _conj + +- func: conj(Tensor(a) self) -> Tensor(a) + variants: function, method + manual_cpp_binding: True + +- func: _conj_physical(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _conj_physical + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr + autogen: _conj_physical.out + +- func: conj_physical(Tensor self) -> Tensor + variants: function, method + tags: pointwise + +- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_physical_out + MPS: conj_physical_out_mps + SparseCPU, SparseCUDA: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out + tags: pointwise + +- func: conj_physical_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: conj_physical_ + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_ + tags: pointwise + +- func: resolve_conj(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: resolve_neg(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: _neg_view(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: _neg_view + +- func: acos(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: acos.out + tags: [core, pointwise] + +- func: acos_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: acos.out + tags: pointwise + +- func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: acos_out + MPS: acos_out_mps + tags: pointwise + +# arccos, alias of acos +- func: arccos(Tensor self) -> Tensor + variants: function, method + +- func: arccos_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + tags: core + +- func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor + tags: core + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) + +- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: add.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr + MkldnnCPU: mkldnn_add + ZeroTensor: add_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor + tags: [core, pointwise] + +- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + structured_delegate: add.out + dispatch: + SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ + MkldnnCPU: mkldnn_add_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor + tags: pointwise + +- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + ufunc_inner_loop: + Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) + ScalarOnly: add (Bool) + dispatch: + SparseCPU, SparseMeta: add_out_sparse_cpu + SparseCUDA: add_out_sparse_cuda + SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu + SparseCsrCUDA: add_out_sparse_compressed_cuda + MkldnnCPU: mkldnn_add_out + MPS: add_out_mps + tags: pointwise + +- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_ + +- func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_out + +- func: _add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_ + autogen: _add_relu.Scalar_out + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: add + tags: [core, pointwise] + +- func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: add_ + autogen: add.Scalar_out + tags: pointwise + +- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + structured_delegate: addmv.out + variants: function, method + +- func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + structured_delegate: addmv.out + variants: function, method + +- func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmv_out_cpu + CUDA: addmv_out_cuda + MPS: addmv_out_mps + SparseCsrCPU: addmv_out_sparse_compressed + SparseCsrCUDA: addmv_out_sparse_compressed_cuda + +- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: addr + MPS: addr_mps + CompositeExplicitAutograd: math_addr + +- func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: addr_ + +- func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: addr_out + MPS: addr_out_mps + CompositeExplicitAutograd: math_addr_out + +- func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: affine_grid_generator + autogen: affine_grid_generator.out + +- func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor + variants: function + +- func: _is_all_true(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _is_all_true + +- func: _is_any_true(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _is_any_true + +# Note: this function is only for testing. +- func: _test_check_tensor(Tensor self) -> Tensor + variants: function + +# Note; this function is only for testing +- func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + CPU: _test_functorch_fallback + autogen: _test_functorch_fallback.out + +- func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.out + variants: function, method + +- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.dims_out + variants: function, method + cpp_no_default_args: ['dim'] + dispatch: + CompositeExplicitAutograd: all_dims_default + +- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: all_out + MPS: all_out_mps + +- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: all_dims_out + CompositeExplicitAutograd: all_dims_out_default + cpp_no_default_args: ['dim'] + +- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool + variants: function, method + tags: data_dependent_output + dispatch: + CompositeExplicitAutograd: allclose + +- func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.out + variants: function, method + tags: core + +- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.dims_out + variants: function, method + cpp_no_default_args: ['dim'] + tags: core + dispatch: + CompositeExplicitAutograd: any_dims_default + +- func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: any_out + MPS: any_out_mps + +- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: any_dims_out + CompositeExplicitAutograd: any_dims_out_default + cpp_no_default_args: ['dim'] + +- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + +- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + +# This operator should be named `arange.start_out` if following the naming convention. However that +# name is already taken. Disabled because of CI job failures. +# FIXME: enable this +#- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) +# dispatch: +# CompositeExplicitAutograd: arange_start_out + +- func: arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + cpp_no_default_args: ['step'] + tags: core + +- func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: arange_out + +- func: arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: arange_out + CUDA: arange_cuda_out + MPS: arange_mps_out + cpp_no_default_args: ['step'] + +# This function is a temporary hack to allow tracing of arange like constructs with dynamic +# bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; +# if the range you need is based on another tensor, calling this function directly will +# preserve tracing. Get rid of this when arange can directly take tensors for bounds +# (so that it can be traced directly). +- func: _dim_arange(Tensor like, int dim) -> Tensor + +- func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor + structured_delegate: argmax.out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: argmax_out + MPS: argmax_out_mps + +- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor + structured_delegate: argmin.out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: argmin_out + MPS: argmin_out_mps + +- func: acosh(Tensor self) -> Tensor + variants: function, method + structured_delegate: acosh.out + tags: [core, pointwise] + +- func: acosh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + structured_delegate: acosh.out + tags: pointwise + +- func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: acosh_out + MPS: acosh_out_mps + tags: pointwise +# arccosh, alias for acosh + +- func: arccosh(Tensor self) -> Tensor + variants: function, method + +- func: arccosh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: asinh(Tensor self) -> Tensor + variants: function, method + structured_delegate: asinh.out + dispatch: + SparseCPU, SparseCUDA: asinh_sparse + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr + tags: [core, pointwise] + +- func: asinh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + structured_delegate: asinh.out + dispatch: + SparseCPU, SparseCUDA: asinh_sparse_ + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_ + tags: pointwise + +- func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: asinh_out + MPS: asinh_out_mps + SparseCPU, SparseCUDA: asinh_sparse_out + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out + tags: pointwise + +# arcsinh, alias for asinh +- func: arcsinh(Tensor self) -> Tensor + variants: function, method + +- func: arcsinh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atanh(Tensor self) -> Tensor + structured_delegate: atanh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atanh_sparse + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr + tags: [core, pointwise] + +- func: atanh_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: atanh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atanh_sparse_ + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_ + tags: pointwise + +- func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atanh_out + MPS: atanh_out_mps + SparseCPU, SparseCUDA: atanh_sparse_out + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out + tags: pointwise +# arctanh, alias for atanh + +- func: arctanh(Tensor self) -> Tensor + variants: function, method + +- func: arctanh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a) + variants: function, method + dispatch: + ZeroTensor, CPU, CUDA: as_strided_tensorimpl + Meta: as_strided_tensorimpl_meta_symint + MPS: as_strided_tensorimpl_mps + QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl + device_check: NoCheck + device_guard: False + tags: core + +- func: as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided__symint + +- func: asin(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: asin.out + dispatch: + SparseCPU, SparseCUDA: asin_sparse + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr + tags: [core, pointwise] + +- func: asin_(Tensor(a!) self) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: asin.out + dispatch: + SparseCPU, SparseCUDA: asin_sparse_ + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_ + tags: pointwise + +- func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: asin_out + MPS: asin_out_mps + SparseCPU, SparseCUDA: asin_sparse_out + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out + tags: pointwise + +# arcsin, alias of asin +- func: arcsin(Tensor self) -> Tensor + variants: function, method + +- func: arcsin_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atan(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: atan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atan_sparse + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr + tags: [core, pointwise] + +- func: atan_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: atan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atan_sparse_ + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_ + tags: pointwise + +- func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atan_out + MPS: atan_out_mps + SparseCPU, SparseCUDA: atan_sparse_out + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out + tags: pointwise + +# arctan, alias of atan +- func: arctan(Tensor self) -> Tensor + variants: function, method + +- func: arctan_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atleast_1d(Tensor self) -> Tensor + variants: function + +- func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] + +- func: atleast_2d(Tensor self) -> Tensor + variants: function + +- func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] + variants: function + +- func: atleast_3d(Tensor self) -> Tensor + variants: function + +- func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] + variants: function + +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + structured_delegate: baddbmm.out + +- func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + structured_delegate: baddbmm.out + +- func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: baddbmm_out_cuda + MPS: baddbmm_out_mps + SparseCsrCUDA: baddbmm_out_sparse_csr_cuda + +- func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: bartlett_window + autogen: bartlett_window.out + +- func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: bartlett_window + autogen: bartlett_window.periodic_out + +- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? 
running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor + +- func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor + dispatch: + QuantizedCPU: quantized_batch_norm + autogen: quantized_batch_norm.out + +- func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int) + +- func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor) + +# Sample bernoulli with values in `self` as probability. +- func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: bernoulli + tags: nondeterministic_seeded + +- func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_out + MPS: bernoulli_out_mps + +- func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.Tensor, bernoulli.Tensor_out + +- func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.float_out + +# Note [bernoulli.p schema] +# We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking) +# This out-of-place version isn't used explicitly, but needed by jit. +# There is no default valid on `p` here because it would introduce ambiguity +# with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. +- func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutogradNonFunctional: bernoulli + +- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor + +- func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_cpu + CUDA: binary_cross_entropy_cuda + MPS: binary_cross_entropy_mps + +- func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_out_cpu + CUDA: binary_cross_entropy_out_cuda + MPS: binary_cross_entropy_out_mps + +- func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean) -> Tensor + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_backward_cpu + CUDA: binary_cross_entropy_backward_cuda + MPS: binary_cross_entropy_backward_mps + +- func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_backward_out_cpu + CUDA: binary_cross_entropy_backward_out_cuda + MPS: binary_cross_entropy_backward_out_mps + +- func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: binary_cross_entropy_with_logits + autogen: binary_cross_entropy_with_logits.out + +- func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor + variants: function, method + dispatch: + CPU: _bincount_cpu + CUDA: _bincount_cuda + MPS: _bincount_mps + tags: dynamic_output_shape + autogen: bincount.out + +- func: bitwise_not(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: bitwise_not.out + variants: function, method + tags: [core, pointwise] + +- func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: bitwise_not.out + variants: method + tags: pointwise + +- func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_not_out + MPS: bitwise_not_out_mps + tags: pointwise + +- func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA, MPS: copysign_out + tags: pointwise + +- func: copysign.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: copysign.out + tags: pointwise + +- func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: copysign.out + +- func: copysign.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: copysign + tags: pointwise + +- func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: copysign_ + +- func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: copysign_out + tags: pointwise + +- func: _lazy_clone(Tensor self) -> Tensor + # Like clone, but the copy takes place lazily, only if either the + # input or the output are written. + variants: function, method + dispatch: + CompositeExplicitAutograd: _lazy_clone + +- func: logical_not(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_not + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not + tags: [core, pointwise] + +- func: logical_not_(Tensor(a!) self) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_not_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_ + tags: pointwise + +- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_not_out + MPS: logical_not_out_mps + tags: pointwise + +- func: logical_xor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_xor + tags: [core, pointwise] + +- func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_xor_ + tags: pointwise + +- func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_xor_out + MPS: logical_xor_out_mps + tags: pointwise + +- func: logical_and(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_and + tags: [core, pointwise] + +- func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_and_ + tags: pointwise + +- func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_and_out + MPS: logical_and_out_mps + tags: pointwise + +- func: logical_or(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_or + tags: [core, pointwise] + +- func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_or_ + tags: pointwise + +- func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_or_out + MPS: logical_or_out_mps + tags: pointwise + +- func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: blackman_window + autogen: blackman_window.out + +- func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: blackman_window + autogen: blackman_window.periodic_out + +- func: bmm(Tensor self, Tensor mat2) -> Tensor + structured_delegate: bmm.out + variants: function, method + dispatch: + SparseCPU: bmm_sparse_cpu + SparseCUDA: bmm_sparse_cuda + NestedTensorCPU: bmm_nested + NestedTensorCUDA: bmm_nested_cuda + tags: core + +- func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + MPS: bmm_out_mps + SparseCPU: bmm_out_sparse_cpu + SparseCUDA: bmm_out_sparse_cuda + SparseCsrCUDA: bmm_out_sparse_csr_cuda + +- func: broadcast_tensors(Tensor[] tensors) -> Tensor[] + device_check: NoCheck + device_guard: False + +- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: broadcast_to_symint + +- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) + variants: function + dispatch: + SparseCPU, SparseCUDA: sparse_broadcast_to + +- func: cat(Tensor[] tensors, int dim=0) -> Tensor + structured_delegate: cat.out + dispatch: + SparseCPU, SparseCUDA: cat_sparse + QuantizedCPU: cat_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: cat_nested + tags: core + +- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + precomputed: + - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format + dispatch: + CPU: cat_out_cpu + CUDA: cat_out_cuda + MPS: cat_out_mps + QuantizedCPU: cat_out_quantized_cpu + +- func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +# alias for torch.cat +- func: concat(Tensor[] tensors, int dim=0) -> Tensor + +- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +# alias for torch.cat +- func: concatenate(Tensor[] tensors, int dim=0) -> Tensor + +- func: concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +- func: block_diag(Tensor[] tensors) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: block_diag + autogen: block_diag.out + +- func: ceil(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: ceil.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: ceil_sparse + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr + tags: [core, pointwise] + +- func: ceil_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: ceil.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: ceil_sparse_ + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_ + tags: pointwise + +- func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: ceil_out + MPS: ceil_out_mps + SparseCPU, SparseCUDA: ceil_sparse_out + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out + tags: pointwise + +# alias for torch.linalg.multi_dot +- func: chain_matmul(Tensor[] matrices) -> Tensor + variants: function + +# alias for torch.linalg.multi_dot +- func: chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!) 
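
(Editorial aside, not part of the patch.) The entries above all follow one schema: each YAML item declares a `func` signature plus optional keys such as `variants`, `dispatch`, `structured_delegate`, and `tags`. As a minimal, hedged sketch of how that schema can be inspected, the snippet below loads a trimmed two-entry excerpt with PyYAML (the same dependency torchgen uses) and filters by tag. The helper `functions_with_tag` and the inline sample are illustrative inventions, not torchgen's real parser or API.

```python
# Illustrative sketch only -- not torchgen's actual parser.
# native_functions.yaml is a YAML list of dicts keyed by "func",
# "variants", "dispatch", "tags", etc.; PyYAML can read it directly.
import yaml  # pyyaml

def functions_with_tag(yaml_text: str, tag: str):
    """Yield the 'func' signatures of entries carrying the given tag."""
    for entry in yaml.safe_load(yaml_text) or []:
        tags = entry.get("tags", [])
        # In this schema 'tags' may be a single string or a list.
        if isinstance(tags, str):
            tags = [tags]
        if tag in tags:
            yield entry["func"]

# Hypothetical two-entry excerpt in the same format as the file above.
sample = """
- func: ceil(Tensor self) -> Tensor
  variants: function, method
  tags: [core, pointwise]
- func: chain_matmul(Tensor[] matrices) -> Tensor
  variants: function
"""

print(list(functions_with_tag(sample, "core")))
# -> ['ceil(Tensor self) -> Tensor']
```

The patch itself continues below unchanged.
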
+ +- func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + +- func: chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: chunk + NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor + +- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_sections_symint + +- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_indices_symint + +- func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] + variants: function, method + +- func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ['min'] + structured_delegate: clamp.out + dispatch: + QuantizedCPU: clamp_quantized_cpu + tags: [core, pointwise] + +- func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor + variants: function, method + structured_delegate: clamp.Tensor_out + tags: [core, pointwise] + +- func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ['min'] + structured_delegate: clamp.out + tags: pointwise + +- func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) + variants: function, method + structured_delegate: clamp.Tensor_out + tags: pointwise + +- func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ['min'] + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_out + MPS: clamp_out_mps + tags: pointwise + +- func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_Tensor_out + MPS: clamp_Tensor_out_mps + tags: pointwise + +- func: clamp_max(Tensor self, Scalar max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_max.out + tags: pointwise + +- func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor + variants: function, method + structured_delegate: clamp_max.Tensor_out + tags: pointwise + +- func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_max.out + tags: pointwise + +- func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!) + variants: function, method + structured_delegate: clamp_max.Tensor_out + tags: pointwise + +- func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_max_out + MPS: clamp_max_out_mps + tags: pointwise + +- func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_max_Tensor_out + MPS: clamp_max_Tensor_out_mps + tags: pointwise + +- func: clamp_min(Tensor self, Scalar min) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_min.out + tags: pointwise + +- func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor + variants: function, method + structured_delegate: clamp_min.Tensor_out + tags: pointwise + +- func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_min.out + tags: pointwise + +- func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!) + variants: function, method + structured_delegate: clamp_min.Tensor_out + tags: pointwise + +- func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_min_out + MPS: clamp_min_out_mps + tags: pointwise + +- func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_min_Tensor_out + MPS: clamp_min_Tensor_out_mps + tags: pointwise + +# clip is an alias for clamp +- func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + cpp_no_default_args: ['min'] + variants: function, method + tags: pointwise + +- func: clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor + variants: function, method + tags: pointwise + +- func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + cpp_no_default_args: ['min'] + variants: function, method + tags: pointwise + +- func: clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) + variants: function, method + tags: pointwise + +- func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['min'] + tags: pointwise + +- func: clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: cudnn_is_acceptable(Tensor self) -> bool + device_check: NoCheck + device_guard: False + +- func: complex(Tensor real, Tensor imag) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: complex + +- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out + MPS: complex_out_mps + +- func: polar(Tensor abs, Tensor angle) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: polar + +- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out + MPS: polar_out_mps + +- func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: constant_pad_nd + MPS: constant_pad_nd_mps + autogen: constant_pad_nd.out + tags: core + +- func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + variants: method + manual_cpp_binding: True + +- func: convolution(Tensor input, Tensor weight, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor + dispatch: + CompositeExplicitAutograd: convolution + autogen: convolution.out + tags: core + +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd, CUDA: convolution_backward + autogen: convolution_backward.out + tags: core + +- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor + dispatch: + CompositeExplicitAutograd: convolution_overrideable + autogen: convolution_overrideable.out + +- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + dispatch: + CompositeExplicitAutograd: convolution_backward_overrideable + autogen: convolution_backward_overrideable.out + +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor + dispatch: + CompositeExplicitAutograd: _convolution + autogen: _convolution.out + +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + +- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CompositeImplicitAutograd: _convolution_mode_symint + +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv1d_symint + +- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv2d_symint + +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv3d_symint + +- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv1d_padding_symint + +- func: conv2d.padding(Tensor input, Tensor weight, Tensor? 
bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv2d_padding_symint + +- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv3d_padding_symint + +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor + dispatch: + CompositeExplicitAutograd: conv_tbc + autogen: conv_tbc.out + +- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) + +# NB: we inherit the goofy argument order from PyTorch torch.nn.functional +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose1d_symint + +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose2d_symint + +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose3d_symint + +- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor + variants: function + dispatch: + Meta: copy_meta + CompositeExplicitAutogradNonFunctional: copy + tags: core + +- func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: copy_mkldnn_ + SparseCPU, SparseCUDA: copy_sparse_wrapper_ + CompositeExplicitAutograd: copy_ + SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ + NestedTensorCPU, NestedTensorCUDA: copy_nested_ + autogen: copy.out + +- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor + dispatch: + MPS: _copy_from_mps + autogen: _copy_from.out + +# We need this to be able to properly copy from a CPU to an XLA tensor with different sizes. +# See https://github.com/pytorch/xla/issues/2881 +- func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor + dispatch: + MPS: _copy_from_and_resize_mps + autogen: _copy_from_and_resize.out + +- func: cos(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cos.out + dispatch: + NestedTensorCPU, NestedTensorCUDA: cos_nested + tags: [core, pointwise] + +- func: cos_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cos.out + tags: pointwise + +- func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: cos_out + MPS: cos_out_mps + tags: pointwise + +- func: cosh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cosh.out + tags: [core, pointwise] + +- func: cosh_(Tensor(a!) self) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cosh.out + tags: pointwise + +- func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: cosh_out + MPS: cosh_out_mps + tags: pointwise + +- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor + +- func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor + variants: function, method + dispatch: + CPU: count_nonzero_cpu + CUDA: count_nonzero_cuda + MPS: count_nonzero_mps + autogen: count_nonzero.dim_IntList_out + +- func: count_nonzero(Tensor self, int? dim=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: count_nonzero + autogen: count_nonzero.out + +- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor + variants: function, method + +- func: corrcoef(Tensor self) -> Tensor + variants: function, method + +- func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid + dispatch: + CUDA: cudnn_affine_grid_generator_forward + autogen: cudnn_affine_grid_generator.out + +# TODO: Why do I have to call this grad?! +- func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta + dispatch: + CUDA: cudnn_affine_grid_generator_backward + autogen: cudnn_affine_grid_generator_backward.out + +- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: cudnn_batch_norm + autogen: cudnn_batch_norm.out + +# NB: You can only use this if you used cudnn_batch_norm training=True +- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: cudnn_batch_norm_backward + autogen: cudnn_batch_norm_backward.out + +- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + dispatch: + CUDA: cudnn_convolution + +- func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CUDA: cudnn_convolution_out + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + dispatch: + CUDA: cudnn_convolution_transpose + autogen: cudnn_convolution_transpose.out + +- func: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + MPS: _mps_convolution_transpose + autogen: _mps_convolution_transpose.out + +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + MPS: mps_convolution_transpose_backward + autogen: mps_convolution_transpose_backward.out + +- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CUDA: cudnn_convolution_relu + autogen: cudnn_convolution_relu.out + +- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CUDA: cudnn_convolution_add_relu + autogen: cudnn_convolution_add_relu.out + +# NB: input is special cased in a way I don't quite understand +- func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output + dispatch: + CUDA: cudnn_grid_sampler_forward + autogen: cudnn_grid_sampler.out + +- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid) + dispatch: + CUDA: cudnn_grid_sampler_backward + autogen: cudnn_grid_sampler_backward.out + +- func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: cummax + +- func: cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: cummax_out + +- func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + +- func: _cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> () + variants: function + dispatch: + CPU: cummax_helper_cpu + CUDA: cummax_helper_cuda + +- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: cummin + +- func: cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: cummin_out + +- func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + device_check: NoCheck # TensorIterator + +- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> () + variants: function + dispatch: + CPU: cummin_helper_cpu + CUDA: cummin_helper_cuda + +- func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumprod.out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumprod.out + variants: method + +- func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: cumprod_out + MPS: cumprod_out_mps + +- func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) + variants: method + +- func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: cumprod_backward(Tensor grad, Tensor input, int dim, Tensor output) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumsum.out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumsum.out + variants: method + +- func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: cumsum_out + MPS: cumsum_out_mps + +- func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) + variants: method + +- func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + +- func: cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + +- func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor + +# convenience function that converts to intlists for you +- func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor + +- func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + Meta: ctc_loss_meta + autogen: _ctc_loss.out + tags: dynamic_output_shape # the shape of second output is data dependent + +- func: _ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: ctc_loss_tensor + autogen: _ctc_loss.Tensor_out + tags: dynamic_output_shape # the shape of second output is data dependent + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + autogen: _ctc_loss_backward.out + +- func: _ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor + dispatch: + CPU, CUDA: ctc_loss_backward_tensor + +- func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutogradNonFunctional: diag_embed + autogen: diag_embed.out + +- func: diagflat(Tensor self, int offset=0) -> Tensor + variants: function, method + +- func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: diagonal + tags: core + +- func: linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a) + python_module: linalg + variants: function + +- func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) + variants: function, method + +- func: diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: diagonal_backward_symint + autogen: diagonal_backward.out + +- func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) + variants: method + +- func: diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor + variants: function, method + +- func: diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +- func: gradient.scalarint(Tensor self, *, Scalar? spacing=None, int? 
dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalararray(Tensor self, *, Scalar spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.array(Tensor self, *, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalarrayarray(Tensor self, *, Scalar[] spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.tensorarrayint(Tensor self, *, Tensor[] spacing, int? dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.tensorarray(Tensor self, *, Tensor[] spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: div.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: div.out + dispatch: + SparseCPU, SparseCUDA: div_sparse + ZeroTensor: div_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor + tags: [core, pointwise] + +- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: div.out + dispatch: + SparseCPU, SparseCUDA: div_sparse_ + tags: pointwise + +- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: div_out + MPS: div_out_mps + SparseCPU, SparseCUDA: div_out_sparse_zerodim + tags: pointwise + +- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: div.out_mode + dispatch: + SparseCPU, SparseCUDA: div_sparse + tags: [core, pointwise] + +- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: div.out_mode + dispatch: + SparseCPU, SparseCUDA: div_sparse_ + tags: pointwise + +- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: div_out_mode + MPS: div_out_mode_mps + SparseCPU, SparseCUDA: div_out_sparse_zerodim + tags: pointwise + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: div.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: div + NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar + tags: [core, pointwise] + +- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: div_ + autogen: div.Scalar_out + tags: pointwise + +- func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: div + tags: [core, pointwise] + +- func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) 
+ variants: method + dispatch: + CompositeExplicitAutograd: div_ + autogen: div.Scalar_mode_out + tags: pointwise + +# divide, alias for div +- func: divide.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: divide.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor + variants: function, method + +- func: divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!) + variants: method + +- func: divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) + +- func: divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor + variants: function, method + +- func: divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) + variants: method + + # true_divide, an alias for div +- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: dot(Tensor self, Tensor tensor) -> Tensor + variants: function, method + dispatch: + CPU: dot + CUDA: dot_cuda + MPS: dot_mps + +- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: dot_out + +- func: vdot(Tensor self, Tensor other) -> Tensor + variants: function, method + dispatch: + CPU: vdot + CUDA: vdot_cuda + +- func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: vdot_out + +- func: einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor + +- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor + dispatch: + CompositeExplicitAutograd: embedding_symint + NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding + autogen: embedding.out + tags: core + +- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor + dispatch: + CompositeImplicitAutograd: embedding_backward_symint + +- func: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor + dispatch: + CPU: embedding_dense_backward_cpu + CUDA: embedding_dense_backward_cuda + MPS: embedding_dense_backward_mps + autogen: embedding_dense_backward.out + tags: core + +- func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
+ dispatch: + CPU: embedding_renorm_cpu_ + CUDA: embedding_renorm_cuda_ + autogen: embedding_renorm, embedding_renorm.out + +- func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor + +# NOTE [ embedding_bag Native Functions ] +# The `_embedding_bag.*` variants assume that input tensors except for `weight`, +# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous. +# We really only need to enforce this for `_embedding_bag` (the forward) because +# the backward inputs are the same as forward ones. +# The above `embedding_bag` wrapper is created to achieve this, e.g., +# applying indices = indices.contiguous(). +# The backward functions apply a check that these input tensors are contiguous. + + +- func: _embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _embedding_bag_forward_only_cpu + CUDA: _embedding_bag_forward_only_cuda + autogen: _embedding_bag_forward_only.out + +- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) + +# row_stack is the alias of vstack +- func: row_stack(Tensor[] tensors) -> Tensor + +- func: row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + +# To keep backward and forward compatibility, and to avoid ambiguity with the +# original signature above, scale_grad_by_freq, mode, sparse, +# per_sample_weights, and include_last_offset parameters do not have default +# values. Once the original signature is removed, default values can be added. +- func: embedding_bag.padding_idx(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, bool include_last_offset, int? padding_idx) -> (Tensor, Tensor, Tensor, Tensor) + +- func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _embedding_bag_cpu + CUDA: _embedding_bag_cuda + autogen: _embedding_bag.out + tags: core + +- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CompositeImplicitAutograd: _embedding_bag_backward_symint + +- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CompositeImplicitAutograd: _embedding_bag_sparse_backward_symint + +- func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? 
per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CPU: _embedding_bag_dense_backward_cpu + CUDA: _embedding_bag_dense_backward_cuda + autogen: _embedding_bag_dense_backward.out + +- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor + dispatch: + CPU: _embedding_bag_per_sample_weights_backward_cpu + CUDA: _embedding_bag_per_sample_weights_backward_cuda + autogen: _embedding_bag_per_sample_weights_backward.out + +- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: empty_names + autogen: empty.names_out + +- func: empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + CPU: empty_cpu + CUDA: empty_cuda + MPS: empty_mps + Meta: empty_meta_symint + MkldnnCPU: empty_mkldnn + SparseCPU, SparseCUDA, SparseMeta: empty_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed + QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized + tags: core + +- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: empty_permuted_symint + autogen: empty_permuted.out + +# We do not make new_empty a composite that calls into new_empty_strided, as the strided version +# is significantly more difficult to implement by different backends +- func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + CompositeExplicitAutograd: new_empty_symint + autogen: new_empty.out + +- func: new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + CompositeExplicitAutogradNonFunctional: new_empty_strided_symint + autogen: new_empty_strided.out + +- func: new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_full + autogen: new_full.out + +- func: new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_zeros + autogen: new_zeros.out + +- func: new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_ones + autogen: new_ones.out + +# other overrides are to provide a more helpful error message that dtype is required +- func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + dispatch: + CPU: empty_affine_quantized_other_backends_stub + QuantizedCPU, QuantizedCUDA: empty_affine_quantized + autogen: _empty_affine_quantized.out + +# it's a factory function receiving a tensor argument, thus overriding explicitly +# other overrides are to provide a more helpful error message that dtype is required +- func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + category_override: factory + dispatch: + CPU: empty_per_channel_affine_quantized_other_backends_stub + QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized + autogen: _empty_per_channel_affine_quantized.out + +- func: resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: method + device_check: NoCheck + device_guard: False + tags: [core, inplace_view] + dispatch: + Meta: resize__symint + CPU: resize_ + CUDA: resize_cuda_ + MPS: resize_mps_ + QuantizedCPU: quantized_resize_cpu_ + SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ + autogen: resize, resize.out + +# This is a utility function to enable users to resize out tensor while registering kernels for out variants. +# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration +# to make it easy to register out variants for ops. +- func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function + dispatch: + Meta: _resize_output_ + autogen: _resize_output, _resize_output.out + +- func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + category_override: factory + variants: function + dispatch: + QuantizedCPU, QuantizedCUDA: empty_quantized + autogen: empty_quantized.out + +- func: empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + device_guard: False + +- func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: empty_like + QuantizedCPU, QuantizedCUDA: empty_like_quantized + SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr + NestedTensorCPU, NestedTensorCUDA: empty_like_nested + autogen: empty_like.out + +- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CPU: empty_strided_cpu + CUDA: empty_strided_cuda + MPS: empty_strided_mps + Meta: empty_strided_meta_symint + QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized + autogen: empty_strided.out + tags: core + +- func: erf(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erf.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: erf_sparse + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr + tags: [core, pointwise] + +- func: erf_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erf.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: erf_sparse_ + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_ + tags: pointwise + +- func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erf_out + MPS: erf_out_mps + SparseCPU, SparseCUDA: erf_sparse_out + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out + tags: pointwise + +- func: erfc(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erfc.out + variants: function, method + tags: pointwise + +- func: erfc_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erfc.out + variants: function, method + tags: pointwise + +- func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erfc_out + tags: pointwise + +- func: exp(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: exp.out + variants: function, method + tags: [core, pointwise] + +- func: exp_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: exp.out + variants: function, method + tags: pointwise + +- func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: exp_out + MPS: exp_out_mps + tags: pointwise + +- func: exp2(Tensor self) -> Tensor + structured_delegate: exp2.out + variants: function, method + tags: pointwise + +- func: exp2_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: exp2.out + variants: function, method + tags: pointwise + +- func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: exp2_out + MPS: exp2_out_mps + tags: pointwise + +- func: expm1(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: expm1.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: expm1_sparse + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr + tags: [core, pointwise] + +- func: expm1_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: expm1.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: expm1_sparse_ + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_ + tags: pointwise + +- func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: expm1_out + MPS: expm1_out_mps + SparseCPU, SparseCUDA: expm1_sparse_out + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out + tags: pointwise + +- func: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: expand + tags: core + +- func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + +# decomposes to eye.m +- func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: eye + +- func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: eye + +- func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: eye_out_cpu + CUDA: eye_out_cuda + MPS: eye_out_mps + +- func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: eye_out_cpu + CUDA: eye_out_cuda + MPS: eye_out_mps + +- func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) + variants: function, method + +- func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_symint + +- func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_dimname_symint + +- func: fill.Scalar(Tensor self, Scalar value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + tags: core + +- func: fill.Tensor(Tensor self, Tensor value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + +- func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: fill_ + MPS: fill_scalar_mps + QuantizedCPU, QuantizedCUDA: fill_quantized_ + Meta: fill_meta_ + SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ + autogen: fill.Scalar_out + +- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: fill_ + MPS: fill_tensor_mps_ + QuantizedCPU, QuantizedCUDA: fill_quantized_ + Meta: fill_meta_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ + autogen: fill.Tensor_out + +- func: floor(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: floor.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: floor_sparse + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr + tags: [core, pointwise] + +- func: floor_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: floor.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: floor_sparse_ + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_ + tags: pointwise + +- func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: floor_out + MPS: floor_out_mps + SparseCPU, SparseCUDA: floor_sparse_out + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out + tags: pointwise + +- func: floor_divide(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: floor_divide + MPS: floor_divide_mps + SparseCPU, SparseCUDA: floor_divide_sparse + +- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: floor_divide_ + MPS: floor_divide_mps_ + SparseCPU, SparseCUDA: floor_divide_sparse_ + +- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: floor_divide_out + MPS: floor_divide_out_mps + SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim + +- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: floor_divide + +- func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: floor_divide_ + autogen: floor_divide.Scalar_out + +- func: frac(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: frac.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: frac_sparse + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr + tags: pointwise + +- func: frac_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: frac.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: frac_sparse_ + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_ + tags: pointwise + +- func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: frac_out + MPS: frac_out_mps + SparseCPU, SparseCUDA: frac_sparse_out + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out + tags: pointwise + +- func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: full + autogen: full.names_out + +- func: full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: full + tags: core + +- func: full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: full_out + +- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: full_like + autogen: full_like.out + +- func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: from_file + autogen: from_file.out + +- func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: gcd_out + tags: pointwise + +- func: gcd(Tensor self, Tensor other) -> Tensor + structured_delegate: gcd.out + variants: function, method + tags: pointwise + +- func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: gcd.out + variants: function, method + +- func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lcm_out + tags: pointwise + +- func: lcm(Tensor self, Tensor other) -> Tensor + structured_delegate: lcm.out + variants: function, method + tags: pointwise + +- func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: lcm.out + variants: function, method + +# NOTE [ grid_sampler Native Functions ] +# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to +# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of +# which has the corresponding backward defined as native functions as well. +# However, we do shape checking everywhere for now since each of the mentioned +# functions can be called directly, which will lead to crashes otherwise. +# See https://github.com/pytorch/pytorch/issues/73187 for more information. +# +# There is also _grid_sampler_2d_backward_cpu_fallback which is an +# implementation detail of grid_sampler_2d and is only exposed here for testing +# purposes. +# +# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to +# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in +# `interpolation_mode` because it only supports Bilinear interpolation mode. +# Nor does it take in `align_corners` because it only supports the mode +# `align_corners = True`. +- func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + +- func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CPU, QuantizedCPU: grid_sampler_2d_cpu + CUDA: grid_sampler_2d_cuda + MPS: grid_sampler_2d_mps + autogen: grid_sampler_2d.out + tags: core + +# `grid_sampler_2d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). 
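+# As a rough illustration of how the entries above are reached (assuming the
+# standard Python API, not something defined in this file): a call such as
+#   torch.nn.functional.grid_sample(input, grid, mode='bilinear',
+#                                   padding_mode='zeros', align_corners=True)
+# goes through `grid_sampler`, which dispatches to `grid_sampler_2d` /
+# `grid_sampler_3d` (or `cudnn_grid_sampler`) as described in the NOTE above,
+# and their gradients come from the corresponding `*_backward` entries below.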
+- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + CPU: grid_sampler_2d_backward_cpu + CUDA: grid_sampler_2d_backward_cuda + autogen: grid_sampler_2d_backward.out + +# See NOTE [ grid_sample CPU fallback ] +- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CompositeExplicitAutograd: _grid_sampler_2d_cpu_fallback + autogen: _grid_sampler_2d_cpu_fallback.out + +- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) + +- func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CPU: grid_sampler_3d_cpu + CUDA: grid_sampler_3d_cuda + autogen: grid_sampler_3d.out + +# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + CPU: grid_sampler_3d_backward_cpu + CUDA: grid_sampler_3d_backward_cuda + autogen: grid_sampler_3d_backward.out + +- func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hann_window + autogen: hann_window.out + +- func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hann_window + autogen: hann_window.periodic_out + +- func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.out + +- func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_out + +- func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_alpha_out + +- func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_alpha_beta_out + +- func: kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.out + +- func: kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.periodic_out + +- func: kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.beta_out + +- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor + +- func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor + +- func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, CUDA: native_group_norm + CompositeExplicitAutograd: math_group_norm + autogen: native_group_norm.out + tags: core + +- func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, CUDA: native_group_norm_backward + autogen: native_group_norm_backward.out + tags: core + +# Real to complex forward FFT +- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor + variants: function + dispatch: + CPU: _fft_r2c_mkl + CUDA: _fft_r2c_cufft + MPS: _fft_r2c_mps + +- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: _fft_r2c_mkl_out + CUDA: _fft_r2c_cufft_out + MPS: _fft_r2c_mps_out + +# Complex to real inverse FFT +- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor + variants: function + dispatch: + CPU: _fft_c2r_mkl + CUDA: _fft_c2r_cufft + MPS: _fft_c2r_mps + +- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: _fft_c2r_mkl_out + CUDA: _fft_c2r_cufft_out + MPS: _fft_c2r_mps_out + +# Standard complex to complex FFT (forward or backward) +- func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor + variants: function + dispatch: + CPU: _fft_c2c_mkl + CUDA: _fft_c2c_cufft + MPS: _fft_c2c_mps + +- func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + dispatch: + CPU: _fft_c2c_mkl_out + CUDA: _fft_c2c_cufft_out + MPS: _fft_c2c_mps_out + +- func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: _validate_compressed_sparse_indices_cpu + CUDA: _validate_compressed_sparse_indices_cuda + +- func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int + +- func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int + +- func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> () + +- func: _cufft_clear_plan_cache(DeviceIndex device_index) -> () + +- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: index.Tensor_out + variants: function, method + dispatch: + QuantizedCPU: quantized_index + tags: [core, dynamic_output_shape] + # NB: This function is special-cased in tools/autograd/gen_variable_type.py + # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: + # - Tensor Tensor::index(ArrayRef indices) + # - Tensor Tensor::index(std::initializer_list indices) + +- func: index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + structured_inherits: TensorIteratorBase + precomputed: + - indices -> DimVector sizes, DimVector strides + dispatch: + CPU, CUDA, MPS: index_out + +# Used by inductor to signal indexing without bounds checks +# Note that we don't support boolean indexing, to avoid dynamic output shapes +- func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_index + +- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: index_copy_out + +- func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + variants: method + structured_delegate: index_copy.out + +- func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor + variants: function, method + structured_delegate: index_copy.out + +- func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!) + variants: method + +- func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor + variants: function, method + +- func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) 
+ device_check: NoCheck # delegate to _index_put_impl_, which leverages TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_put_ + autogen: index_put.out + # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: + # - Tensor & Tensor::index_put_(ArrayRef indices, Tensor const & rhs) + # - Tensor & Tensor::index_put_(ArrayRef indices, Scalar v) + # - Tensor & Tensor::index_put_(std::initializer_list indices, Tensor const & rhs) + # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) + +- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_put + tags: core + +- func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_index_put + +- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA, MPS: _index_put_impl_ + QuantizedCPU: _index_put_impl_quantized_cpu_ + QuantizedCUDA: _index_put_impl_quantized_cuda_ + autogen: _index_put_impl, _index_put_impl.out + +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor + variants: function + +- func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor + variants: function, method + +- func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Tensor_out + MPS: isin_Tensor_Tensor_out_mps + +- func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Tensor_out + +- func: isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Scalar_out + +- func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Scalar_out + +- func: isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + structured: True + dispatch: + CPU, CUDA: isin_Scalar_Tensor_out + +- func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Scalar_Tensor_out + +- func: isnan(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, MPS: isnan + SparseCPU, SparseCUDA: isnan_sparse + SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr + autogen: isnan.out + tags: [core, pointwise] + +- func: is_distributed(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + +- func: is_floating_point(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_complex(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_conj(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: _is_zerotensor(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: is_neg(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: isreal(Tensor self) -> Tensor + variants: function, method + +- func: is_nonzero(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + +- func: is_same_size(Tensor self, Tensor other) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_is_same_size + CompositeExplicitAutograd: is_same_size + +- func: is_signed(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_inference(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor + +- func: kron(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: kthvalue + +- func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: kthvalue_out_cpu + CUDA: kthvalue_out_cuda + +- func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor + dispatch: + CompositeImplicitAutograd: layer_norm_symint + +- func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: layer_norm_cpu + CUDA: layer_norm_cuda + MPS: layer_norm_mps + CompositeExplicitAutograd: math_native_layer_norm + NestedTensorCPU, NestedTensorCUDA: nested_layer_norm + autogen: native_layer_norm.out + tags: core + +- func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: layer_norm_backward_cpu + CUDA: layer_norm_backward_cuda + MPS: layer_norm_backward_mps + NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested + autogen: native_layer_norm_backward.out + tags: core + +- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + +- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: nan_to_num + SparseCPU, SparseCUDA: nan_to_num_sparse + tags: pointwise + +- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: nan_to_num_ + SparseCPU, SparseCUDA: nan_to_num_sparse_ + tags: pointwise + +- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nan_to_num_out + MPS: nan_to_num_out_mps + SparseCPU, SparseCUDA: nan_to_num_sparse_out + tags: pointwise + +- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: linear + NestedTensorCPU, NestedTensorCUDA: nested_linear + MPS: _mps_linear + +- func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_linear_backward + MPS: mps_linear_backward + autogen: linear_backward.out + +- func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CompositeExplicitAutograd: linear_out + +- func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + MkldnnCPU: mkldnn_linear + autogen: mkldnn_linear.out + +- func: mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor + dispatch: + MkldnnCPU: mkldnn_linear_backward_input + autogen: mkldnn_linear_backward_input.out + +- func: mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) + dispatch: + MkldnnCPU: mkldnn_linear_backward_weights + autogen: mkldnn_linear_backward_weights.out + +- func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MkldnnCPU: mkldnn_linear_backward + autogen: mkldnn_linear_backward.out + +- func: _cslt_compress(Tensor input) -> Tensor + dispatch: + CUDA: _cslt_compress + +- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor + dispatch: + CUDA: _cslt_sparse_mm + +- func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? 
out_dtype=None, bool transpose_result=False) -> int + dispatch: + CUDA: _cslt_sparse_mm_search + +- func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_tile + +- func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_apply + +- func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_apply_dense + +# DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead +- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_linear + +- func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_mm + +- func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_addmm + +- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor + dispatch: + CUDA: _mixed_dtypes_linear + +- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor + +- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor + +- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) + +- func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor + +- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor + +- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor + +- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor + +- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor + +- func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: function, method + tags: pointwise + +- func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: linspace_out + CUDA: linspace_cuda_out + MPS: linspace_out_mps + +- func: linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + +- func: linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + +- func: linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + +- func: log(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log.out + variants: function, method + tags: [core, pointwise] + +- func: log_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log.out + variants: function, method + tags: pointwise + +- func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log_out + MPS: log_out_mps + tags: pointwise + +- func: log10(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log10.out + variants: function, method + tags: [core, pointwise] + +- func: log10_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log10.out + variants: function, method + tags: pointwise + +- func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log10_out + MPS: log10_out_mps + tags: pointwise + +- func: log1p(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log1p.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: log1p_sparse + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr + tags: [core, pointwise] + +- func: log1p_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log1p.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: log1p_sparse_ + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_ + tags: pointwise + +- func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log1p_out + MPS: log1p_out_mps + SparseCPU, SparseCUDA: log1p_sparse_out + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out + tags: pointwise + +- func: log2(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log2.out + variants: function, method + tags: [core, pointwise] + +- func: log2_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log2.out + variants: function, method + tags: pointwise + +- func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log2_out + MPS: log2_out_mps + tags: pointwise + +- func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logaddexp_out + MPS: logaddexp_out_mps + tags: pointwise + +- func: logaddexp(Tensor self, Tensor other) -> Tensor + variants: method, function + structured_delegate: logaddexp.out + tags: pointwise + +- func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logaddexp2_out + MPS: logaddexp2_out_mps + tags: pointwise + +- func: logaddexp2(Tensor self, Tensor other) -> Tensor + variants: method, function + structured_delegate: logaddexp2.out + tags: pointwise + +- func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: xlogy.OutTensor + variants: function, method + tags: pointwise + +- func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy + tags: pointwise + +- func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: xlogy + tags: pointwise + +# xlogy: inplace variant +- func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: xlogy.OutTensor + tags: pointwise + +- func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: xlogy_ + +# xlogy: out variant +- func: xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: xlogy_out + MPS: xlogy_out_mps + tags: pointwise + +- func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy_out + tags: pointwise + +- func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy_out + tags: pointwise + +- func: logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, Meta: logspace_out + CUDA: logspace_cuda_out + +- func: logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + +- func: logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + +- func: logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + +# log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. +- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: log_softmax_out + +- func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _log_softmax.out + tags: core + +- func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: log_softmax_cpu_out + CUDA: log_softmax_cuda_out + MPS: log_softmax_mps_out + +- func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor + structured_delegate: _log_softmax_backward_data.out + +- func: _log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: log_softmax_backward_cpu_out + CUDA: log_softmax_backward_cuda_out + MPS: log_softmax_backward_mps_out + +- func: _logcumsumexp(Tensor self, int dim) -> Tensor + dispatch: + CPU: _logcumsumexp_cpu + CUDA: _logcumsumexp_cuda + +- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _logcumsumexp_out_cpu + CUDA: _logcumsumexp_out_cuda + +- func: logcumsumexp(Tensor self, int dim) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: logcumsumexp + +- func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: logcumsumexp_out + +- func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor + variants: function, method + +- func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +- func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logsumexp + +- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + # calls squeeze + CompositeExplicitAutogradNonFunctional: logsumexp_out + +- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + +- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor + +- func: matmul(Tensor self, Tensor other) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: matmul + NestedTensorCPU, NestedTensorCUDA: matmul_nested + +- func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor) + dispatch: + NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested + autogen: matmul_backward.out + +- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeImplicitAutograd: matmul_out + NestedTensorCPU, NestedTensorCUDA: matmul_out_nested + +# Alias to linalg.matrix_power +- func: matrix_power(Tensor self, int n) -> Tensor + variants: function, method + +# Alias to linalg.matrix_power +- func: matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) + +# Alias to linalg.matrix_exp +- func: matrix_exp(Tensor self) -> Tensor + variants: function, method + +# This function should be deprecated in favor of differential_analytic_matrix_function in FunctionsManual.cpp +- func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor + +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax(Tensor self) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax_all + autogen: _aminmax.out + +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax + autogen: _aminmax.dim_out + +- func: aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max) + device_check: NoCheck # TensorIterator + structured_delegate: aminmax.out + variants: function, method + +- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: aminmax_out + MPS: aminmax_out_mps + +- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor + dispatch: + CPU, CUDA: _compute_linear_combination + +- func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: _compute_linear_combination_out + +- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + structured_delegate: max.dim_max + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: qmax + tags: core + +- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: max_out + MPS: max_out_mps + +- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) 
indices) + device_check: NoCheck # TensorIterator + +- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: value_selecting_reduction_backward_symint + +- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + variants: function, method + structured_delegate: amax.out + tags: core + +- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: amax_out + MPS: amax_out_mps + +# Return: (Tensor output, Tensor indices) +- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + +- func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + +- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + CompositeImplicitAutograd: max_pool2d + MPS: mps_max_pool2d + +- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: mps_max_pool2d_backward + autogen: max_pool2d_backward.out + +- func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool2d + autogen: mkldnn_max_pool2d.out + +- func: mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool2d_backward + autogen: mkldnn_max_pool2d_backward.out + +- func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool3d + autogen: mkldnn_max_pool3d.out + +- func: mkldnn_max_pool3d_backward(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool3d_backward + autogen: mkldnn_max_pool3d_backward.out + +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool1d + autogen: quantized_max_pool1d.out + +- func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool2d + QuantizedCUDA: quantized_max_pool2d_cudnn + autogen: quantized_max_pool2d.out + +- func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool3d + autogen: quantized_max_pool3d.out + +- func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + +# The CPU and GPU dispatch variants are named weirdly here because otherwise there +# are namespacing issues in C++ +- func: mean(Tensor self, *, ScalarType? 
dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: mean + tags: core + +# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this. +# FIXME: fix CI jobs and re-enable this +#- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +# device_check: NoCheck # TensorIterator +# dispatch: +# CompositeExplicitAutograd: mean_dtype_out + +- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: mean.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + QuantizedCPU: mean_quantized_cpu + tags: core + +- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: mean_out + MPS: mean_out_mps + QuantizedCPU: mean_out_quantized_cpu + +- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # Composite + variants: function, method + +- func: nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # Composite + +- func: median(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU: median_cpu + CUDA: median_cuda + MPS: median_mps + autogen: median.out + +- func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: median + +- func: median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: median_out_cpu + CUDA: median_out_cuda + MPS: median_out_mps + +- func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: nanmedian(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU: nanmedian_cpu + CUDA: nanmedian_cuda + autogen: nanmedian.out + +- func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: nanmedian + +- func: nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: nanmedian_out_cpu + CUDA: nanmedian_out_cuda + +- func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + +- func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + structured_delegate: min.dim_min + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: qmin + tags: core + +- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: min_out + MPS: min_out_mps + +- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + +- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + variants: function, method + structured_delegate: amin.out + tags: core + +- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: amin_out + MPS: amin_out_mps + +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + MPS: _mps_convolution + autogen: _mps_convolution.out + +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_convolution_backward + autogen: mps_convolution_backward.out + +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CompositeExplicitAutograd: mkldnn_convolution + autogen: mkldnn_convolution.out + +- func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: mkldnn_rnn_layer + MkldnnCPU: mkldnn_rnn_layer + autogen: mkldnn_rnn_layer.out + +- func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: mkldnn_rnn_layer_backward + autogen: mkldnn_rnn_layer_backward.out + +- func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_batch_norm + autogen: miopen_batch_norm.out + +- func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? 
save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_batch_norm_backward + autogen: miopen_batch_norm_backward.out + +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_convolution + autogen: miopen_convolution.out + +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_convolution_transpose + autogen: miopen_convolution_transpose.out + +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_depthwise_convolution + autogen: miopen_depthwise_convolution.out + +- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CUDA: miopen_convolution_relu + +- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CUDA: miopen_convolution_add_relu + +- func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_rnn + autogen: miopen_rnn.out + tags: nondeterministic_seeded + + +- func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + dispatch: + CUDA: miopen_rnn_backward + autogen: miopen_rnn_backward.out + +- func: mm(Tensor self, Tensor mat2) -> Tensor + structured_delegate: mm.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: _sparse_mm + SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm + tags: core + +- func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: mm_out_cpu + CUDA: mm_out_cuda + MPS: mm_out_mps + SparseCPU, SparseCUDA: _sparse_mm_out + SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out + +- func: _int_mm(Tensor self, Tensor mat2) -> Tensor + dispatch: + CPU: _int_mm_cpu + CUDA: _int_mm_cuda + +- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU: _int_mm_out_cpu + CUDA: _int_mm_out_cuda + +- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor + dispatch: + CPU: _convert_weight_to_int4pack_cpu + CUDA: _convert_weight_to_int4pack_cuda + +- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor + dispatch: + CPU: _weight_int4pack_mm_cpu + MPS: _weight_int4pack_mm_mps + CUDA: _weight_int4pack_mm_cuda + +- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor + dispatch: + CPU: _weight_int8pack_mm_cpu + MPS: _weight_int8pack_mm_mps + +- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor + python_module: sparse + +- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor + python_module: sparse + +- func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor + dispatch: + SparseCPU: sparse_sparse_matmul_cpu + SparseCUDA: sparse_sparse_matmul_cuda + autogen: _sparse_sparse_matmul.out + +- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CPU, CUDA: mode + +- func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CompositeExplicitAutograd: mode_out + +- func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: mul.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: mul.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: mul_sparse + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr + MkldnnCPU: mkldnn_mul + ZeroTensor: mul_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor + tags: [core, pointwise] + +- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: mul.out + variants: method + dispatch: + SparseCPU, SparseCUDA: mul_sparse_ + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ + MkldnnCPU: mkldnn_mul_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor + tags: pointwise + +- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: mul_out + MPS: mul_out_mps + SparseCPU: mul_out_sparse_cpu + SparseCUDA: mul_out_sparse_cuda + SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr + MkldnnCPU: mkldnn_mul_out + tags: pointwise + # For C++ only, until we have conversion from C++ numbers to Tensor + +- func: mul.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: mul + SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar + tags: [core, pointwise] + +- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: mul_ + SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar + autogen: mul.Scalar_out + tags: pointwise +# multiply, alias for mul + +- func: multiply.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: multiply.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: mv(Tensor self, Tensor vec) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: mv + SparseCPU, SparseCUDA: mv_sparse + +- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: mv_out + +- func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: mvlgamma_out + tags: pointwise + +- func: mvlgamma(Tensor self, int p) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: mvlgamma + tags: pointwise + +- func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: mvlgamma_ + tags: pointwise + +- func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor + variants: function, method + dispatch: + CPU: narrow_copy_dense_cpu + SparseCPU, SparseCUDA: narrow_copy_sparse + CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint + tags: view_copy + +- func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: narrow_copy_dense_cpu_out + +- func: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: narrow_symint + NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint + +- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: narrow_tensor_symint + +- func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: batch_norm_cpu + CUDA: batch_norm_cuda + MPS: batch_norm_mps + MkldnnCPU: mkldnn_batch_norm + +- func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + dispatch: + CUDA: batch_norm_cuda_out + MPS: batch_norm_mps_out + CPU: batch_norm_cpu_out + +# TODO: In 2 weeks, we should make native_batch_norm composite implicit so that this correct schema percolates correctly through our dispatching +- func: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) 
running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_legit_cpu + CUDA: _batch_norm_legit_cuda + MPS: _batch_norm_legit_mps + MkldnnCPU: _mkldnn_batch_norm_legit + autogen: _native_batch_norm_legit_functional + tags: core + +# HACK: identical to _native_batch_norm_legit, but training is known to be False, +# So we known that running stats will not be mutated. +# The real fix here is batch norm consolidation. +- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_legit_no_training + autogen: _native_batch_norm_legit_no_training.out + tags: core + +- func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!)) + dispatch: + CPU: _batch_norm_legit_cpu_out + CUDA: _batch_norm_legit_cuda_out + MPS: _batch_norm_legit_mps_out + +- func: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_legit_no_stats_cpu + CUDA: _batch_norm_legit_no_stats_cuda + MPS: _batch_norm_legit_no_stats_mps + MkldnnCPU: _mkldnn_batch_norm_legit_no_stats + tags: core + +- func: _native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + dispatch: + CPU: _batch_norm_legit_no_stats_cpu_out + CUDA: _batch_norm_legit_no_stats_cuda_out + MPS: _batch_norm_legit_no_stats_mps_out + +- func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_stats_cuda + autogen: batch_norm_stats.out + +- func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor + dispatch: + CUDA: batch_norm_elemt_cuda + +- func: batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CUDA: batch_norm_elemt_cuda_out + +# for backward compatibility +- func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_gather_stats_cuda + autogen: batch_norm_gather_stats.out + +- func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_gather_stats_with_counts_cuda + autogen: batch_norm_gather_stats_with_counts.out + +- func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: batch_norm_backward_cpu + CUDA: batch_norm_backward_cuda + MPS: batch_norm_backward_mps + MkldnnCPU: mkldnn_batch_norm_backward + autogen: native_batch_norm_backward.out + +- func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? 
weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: batch_norm_backward_reduce_cuda + autogen: batch_norm_backward_reduce.out + +- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor + dispatch: + CUDA: batch_norm_backward_elemt_cuda + autogen: batch_norm_backward_elemt.out + +- func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor) + dispatch: + CPU: batch_norm_update_stats_cpu + CUDA: batch_norm_update_stats_cuda + autogen: batch_norm_update_stats.out + +- func: is_vulkan_available() -> bool + +- func: _nnpack_available() -> bool + +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _nnpack_spatial_convolution + autogen: _nnpack_spatial_convolution.out + +- func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: ones + autogen: ones.names_out + +- func: ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: ones + +- func: ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: ones_out + +- func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: ones_like + NestedTensorCPU, NestedTensorCUDA: ones_like + autogen: ones_like.out + +- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor + +- func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor + +- func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor + dispatch: + CompositeExplicitAutograd: _euclidean_dist + autogen: _euclidean_dist.out + +- func: _cdist_forward(Tensor x1, Tensor x2, float p, int? 
compute_mode) -> Tensor + dispatch: + CPU, CUDA: _cdist_forward + MPS: _cdist_forward_mps + autogen: _cdist_forward.out + tags: core + +- func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor + dispatch: + CPU, CUDA: _cdist_backward + autogen: _cdist_backward.out + +- func: pdist(Tensor self, float p=2) -> Tensor + +- func: _pdist_forward(Tensor self, float p=2) -> Tensor + dispatch: + CPU, CUDA: _pdist_forward + autogen: _pdist_forward.out + tags: core + +- func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor + dispatch: + CPU, CUDA: _pdist_backward + autogen: _pdist_backward.out + +- func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor + variants: function + +- func: permute(Tensor(a) self, int[] dims) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: permute + MPS: permute_mps + SparseCPU, SparseCUDA: permute_sparse_coo + tags: core + +- func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + variants: function, method + +- func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) + variants: function, method + +# moveaxis, alias for movedim +- func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + variants: function, method + +- func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) + variants: function, method + +# Only exposed from C++ -- in Python, +# we expose it as an attribute `T`, not a function. +# +# I'd like to name this "T" in C++ too, but +# calling a native function "T" causes undefined +# behavior on Windows, for reasons I don't understand +# (maybe related to capital letter collation somehow...) +- func: numpy_T(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'H' +- func: matrix_H(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'mT' +- func: mT(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'mH' +- func: mH(Tensor(a) self) -> Tensor(a) + variants: method + +- func: adjoint(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + dispatch: + CPU: pixel_shuffle_cpu + MPS: pixel_shuffle_mps + CompositeExplicitAutogradNonFunctional: math_pixel_shuffle + autogen: pixel_shuffle.out + +- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + dispatch: + CPU: pixel_unshuffle_cpu + MPS: pixel_unshuffle_mps + CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle + autogen: pixel_unshuffle.out + +- func: channel_shuffle(Tensor self, SymInt groups) -> Tensor + dispatch: + CPU, CUDA: channel_shuffle + QuantizedCPU: channel_shuffle_quantized_cpu + autogen: channel_shuffle.out + +- func: native_channel_shuffle(Tensor self, SymInt groups) -> Tensor + dispatch: + CPU: channel_shuffle_cpu + CompositeImplicitAutograd: math_channel_shuffle + +- func: is_pinned(Tensor self, Device? device=None) -> bool + variants: method + dispatch: + NestedTensorCUDA, CUDA: is_pinned_cuda + MPS: is_pinned_mps + CompositeExplicitAutograd: is_pinned_default + +# TODO: add a copy kwarg that guarantees that the tensor is put into fresh +# pinned memory +- func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) + variants: method + +# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor +- func: _pin_memory(Tensor self, Device? 
device=None) -> Tensor + dispatch: + CUDA: _pin_memory_cuda + MPS: _pin_memory_mps + NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested + autogen: _pin_memory.out + +- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor + variants: function, method + +- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor + variants: function + +- func: rad2deg(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: rad2deg + SparseCPU, SparseCUDA: rad2deg_sparse + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr + +- func: rad2deg_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: rad2deg_ + SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_ + +- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: rad2deg_out + SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_out + +- func: deg2rad(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: deg2rad + SparseCPU, SparseCUDA: deg2rad_sparse + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr + tags: pointwise + +- func: deg2rad_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: deg2rad_ + SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_ + tags: pointwise + +- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: deg2rad_out + SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out + tags: pointwise + +- func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: scalar_tensor + autogen: scalar_tensor.out + tags: core + +- func: rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: rand + autogen: rand.names_out + tags: nondeterministic_seeded + +- func: rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand + autogen: rand.generator_with_names_out + +- func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] + dispatch: + CompositeExplicitAutograd: rand + +- func: rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand + +- func: rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand_out + +- func: rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: rand_like + autogen: rand_like.out + +- func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: randint_like + autogen: randint_like.out + +- func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: randint_like + autogen: randint_like.low_dtype_out + +- func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] + dispatch: + CompositeExplicitAutograd: randn + +- func: randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randn + +- func: randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + tags: nondeterministic_seeded + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: randn + autogen: randn.names_out + +- func: randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: randn + autogen: randn.generator_with_names_out + +- func: randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like + autogen: randn_like.out + +- func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] + dispatch: + CompositeExplicitAutograd: randperm + +- func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randperm + +- func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randperm_out + +- func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU: randperm_out_cpu + CUDA: randperm_out_cuda + MPS: randperm_out_mps + +- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: range_out_no_step + +- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: range_out + CUDA: range_cuda_out + MPS: range_mps_out + cpp_no_default_args: ['step'] + +- func: ravel(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: reciprocal(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: reciprocal.out + variants: function, method + tags: [core, pointwise] + +- func: reciprocal_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: reciprocal.out + variants: function, method + tags: pointwise + +- func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: reciprocal_out + MPS: reciprocal_out_mps + tags: pointwise + +- func: neg(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: neg.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: neg_sparse + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg + tags: [core, pointwise] + +- func: neg_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: neg.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: neg_sparse_ + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_ + tags: pointwise + +- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: neg_out + MPS: neg_out_mps + SparseCPU, SparseCUDA: neg_out_sparse + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out + tags: pointwise +# Alias for neg + +- func: negative(Tensor self) -> Tensor + variants: function, method + +- func: negative_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: repeat(Tensor self, SymInt[] repeats) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + dispatch: + CompositeExplicitAutograd: repeat + MPS: repeat_mps + autogen: repeat.out + tags: core + +- func: repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor + variants: function + dispatch: + CPU: repeat_interleave_cpu + CUDA: repeat_interleave_cuda + MPS: repeat_interleave_mps + tags: dynamic_output_shape + autogen: repeat_interleave.Tensor_out + +- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: repeat_interleave_symint + +- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: repeat_interleave_symint + +- func: reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: reshape_symint + CompositeImplicitAutogradNestedTensor: reshape_nested_symint + +- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_copy_symint + +# NOTE [ _reshape_alias ] is meant to be used in the implementation of reshape. +# They are not user-facing, hence the leading underscore. Please don't use it +# anywhere else. +- func: _reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias + # We don't need to support mkldnn since this is handled explicitly by the reshape operator. 
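+# Illustrative sketch (an assumption, not prescribed by this schema): per
+# NOTE [ _reshape_alias ], reshape() is expected to forward to _reshape_alias()
+# only when the requested size is compatible with the input's existing strides,
+# and to fall back to a copy otherwise, roughly:
+#
+#   // C++ sketch, assuming ATen's at::detail::computeStride helper
+#   if (auto stride = at::detail::computeStride(self.sizes(), self.strides(), size)) {
+#     return self._reshape_alias(size, *stride);  // aliasing view, no data copy
+#   }
+#   return at::_unsafe_view(self.clone(at::MemoryFormat::Contiguous), size);  // copy path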
+ +- func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_reshape + autogen: _mkldnn_reshape.out + +- func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: reshape_as + CompositeImplicitAutogradNestedTensor: reshape_as_nested + +- func: round(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: round.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: round_sparse + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr + tags: [core, pointwise] + +- func: round_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: round.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: round_sparse_ + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_ + tags: pointwise + +- func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: round_out + CUDA: round_out + MPS: round_out_mps + SparseCPU, SparseCUDA: round_sparse_out + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out + tags: pointwise + +- func: round.decimals(Tensor self, *, int decimals) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: round.decimals_out + variants: function, method + tags: pointwise + +- func: round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: round.decimals_out + variants: function, method + tags: pointwise + +- func: round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: round_decimals_out + CUDA: round_decimals_out + tags: pointwise + +- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + +- func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) + tags: nondeterministic_seeded + device_check: NoCheck # TensorIterator + +- func: relu(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: relu + MPS: relu_mps + MkldnnCPU: mkldnn_relu + QuantizedCPU: relu_quantized_cpu + QuantizedCUDA: relu_quantized_cuda + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu + SparseCPU, SparseCUDA: relu_sparse + SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr + tags: [core, pointwise] + +- func: relu_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: relu_ + MPS: relu_mps_ + MkldnnCPU: mkldnn_relu_ + QuantizedCPU: relu_quantized_cpu_ + QuantizedCUDA: relu_quantized_cuda_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ + SparseCPU, SparseCUDA: relu_sparse_ + SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr_ + autogen: relu.out + tags: pointwise + +- func: relu6(Tensor self) -> Tensor + python_module: nn + +- func: relu6_(Tensor(a!) self) -> Tensor(a!) 
+ python_module: nn + +- func: prelu(Tensor self, Tensor weight) -> Tensor + variants: function, method + autogen: prelu.out + +- func: _prelu_kernel(Tensor self, Tensor weight) -> Tensor + dispatch: + CPU, CUDA: _prelu_kernel + QuantizedCPU: _prelu_kernel_quantized_cpu + MkldnnCPU: mkldnn_prelu + MPS: prelu_mps + +- func: _prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _prelu_kernel_backward + MkldnnCPU: mkldnn_prelu_backward + MPS: prelu_backward_mps + +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: gelu_out_cpu + CUDA: gelu_out_cuda + MPS: gelu_out_mps + +- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: gelu_quantized_cpu_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ + +- func: gelu(Tensor self, *, str approximate='none') -> Tensor + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + MkldnnCPU: mkldnn_gelu + QuantizedCPU: gelu_quantized_cpu + QuantizedCUDA: gelu_quantized_cuda + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu + tags: [core, pointwise] + +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU: gelu_backward_out_cpu + CUDA: gelu_backward_out_cuda + MPS: gelu_backward_out_mps + +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor + structured_delegate: gelu_backward.grad_input + python_module: nn + dispatch: + MkldnnCPU: mkldnn_gelu_backward + NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested + tags: pointwise + +- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor + variants: function + python_module: nn + device_check: NoCheck + device_guard: False + +- func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: hardshrink_out + +- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + structured_delegate: hardshrink.out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: hardshrink_backward_out + +- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + structured_delegate: hardshrink_backward.grad_input + variants: function, method + +- func: rsqrt(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: rsqrt.out + variants: function, method + tags: [core, pointwise] + +- func: rsqrt_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: rsqrt.out + variants: function, method + tags: pointwise + +- func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: rsqrt_out + MPS: rsqrt_out_mps + tags: pointwise + +- func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: select_symint + SparseCsrCPU, SparseCsrCUDA: select_sparse_csr + NestedTensorCPU, NestedTensorCUDA: select_nested + tags: core + +- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: select_backward_symint + autogen: select_backward.out + +- func: _nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint + +- func: selu(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + +- func: selu_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: celu + +- func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: celu_ + autogen: celu.out + +- func: silu(Tensor self) -> Tensor + structured_delegate: silu.out + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu + tags: pointwise + +- func: silu_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: silu.out + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_ + tags: pointwise + +- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: silu_out + MPS: silu_out_mps + tags: pointwise + +- func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: silu_backward_out + MPS: silu_backward_out_mps + tags: pointwise + +- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: silu_backward.grad_input + python_module: nn + dispatch: + CompositeImplicitAutograd: math_silu_backward + NestedTensorCPU, NestedTensorCUDA: silu_backward_nested + tags: pointwise + +- func: mish(Tensor self) -> Tensor + structured_delegate: mish.out + python_module: nn + +- func: mish_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: mish.out + python_module: nn + +- func: mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: mish_out + MPS: mish_out_mps + +- func: mish_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: mish_backward + MPS: mish_backward_mps + CompositeImplicitAutograd: math_mish_backward + +- func: sigmoid(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sigmoid.out + variants: function, method + dispatch: + QuantizedCPU: sigmoid_quantized_cpu + MkldnnCPU: mkldnn_sigmoid + tags: [core, pointwise] + +- func: sigmoid_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sigmoid.out + variants: function, method + dispatch: + MkldnnCPU: mkldnn_sigmoid_ + tags: pointwise + +- func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sigmoid_out + MPS: sigmoid_out_mps + tags: pointwise + +- func: logit(Tensor self, float? eps=None) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: logit + MPS: logit_mps + tags: pointwise + +- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) + variants: function, method + dispatch: + CPU, CUDA: logit_ + tags: pointwise + +- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out + MPS: logit_out_mps + tags: pointwise + +- func: sin(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sin.out + variants: function, method + dispatch: + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr + SparseCPU, SparseCUDA: sin_sparse + NestedTensorCPU, NestedTensorCUDA: sin_nested + tags: [core, pointwise] + +- func: sin_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sin.out + variants: function, method + dispatch: + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_ + SparseCPU, SparseCUDA: sin_sparse_ + tags: pointwise + +- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sin_out + MPS: sin_out_mps + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out + SparseCPU, SparseCUDA: sin_sparse_out + tags: pointwise + +- func: sinc(Tensor self) -> Tensor + structured_delegate: sinc.out + variants: function, method + tags: pointwise + +- func: sinc_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: sinc.out + variants: function, method + tags: pointwise + +- func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sinc_out + tags: pointwise + +- func: sinh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sinh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sinh_sparse + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr + tags: [core, pointwise] + +- func: sinh_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sinh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sinh_sparse_ + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_ + tags: pointwise + +- func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sinh_out + MPS: sinh_out_mps + SparseCPU, SparseCUDA: sinh_sparse_out + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out + +# Returns a copy of this `Variable` that is detached from its autograd graph. +# This method is OK to call if the `Variable` is a view. +# +# NOTE: Previously, if we change the tensor metadata (e.g. sizes / strides / +# storage / storage_offset) of a tensor created from `detach()`, those metadata +# in the original tensor will also be updated. However, the new behavior is that +# those metadata changes to the detached tensor will not update the original tensor +# anymore, and in the `detach()` function we need to set `allow_tensor_metadata_change_` +# to false to make such changes explicitly illegal, in order to prevent users from +# changing metadata of the detached tensor and expecting the original tensor to also +# be updated. + tags: pointwise +- func: detach(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: detach + NestedTensorCPU, NestedTensorCUDA: detach + +# Like `detach()`, but modifies this `Variable` in-place. This method may +# only be called on non-view `Variable`s. You can use `is_view()` to check +# this. If this `Variable` is a view, throws an `std::runtime_error()`. +- func: detach_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + tags: inplace_view + dispatch: + CompositeExplicitAutograd: detach_ + +- func: size.int(Tensor self, int dim) -> int + variants: function + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: size.Dimname(Tensor self, Dimname dim) -> int + variants: function, method + device_check: NoCheck + device_guard: False + +- func: sym_size.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_numel(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_storage_offset(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice + tags: core + +# NOTE: The implementation of split_with_sizes bypasses the dispatcher to call this; undo +# that if adding specific implementations here! + +- func: slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice_backward + autogen: slice_backward.out + +# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk, +# slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification +# of PT2 graph input subclass instances that are views. This means: +# * This op shouldn't really show up in eager mode (so e.g. 
XLA shouldn't have to implement it) +# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it) +# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph +# input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is +# easier to implement for a subclass than as_strided() +- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice_inverse_symint + +- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: slice_scatter + autogen: slice_scatter.out + tags: [core, view_copy] + +- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: select_scatter_symint + autogen: select_scatter.out + tags: core + +- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: diagonal_scatter + autogen: diagonal_scatter.out + +- func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint + autogen: as_strided_scatter.out + +- func: smm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + +# softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. +- func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: softmax_out + +- func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _softmax.out + dispatch: + MkldnnCPU: mkldnn_softmax + NestedTensorCPU, NestedTensorCUDA: softmax_nested + tags: core + +- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: softmax_cpu_out + CUDA: softmax_cuda_out + MPS: softmax_mps_out + +- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor + structured_delegate: _softmax_backward_data.out + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward + +- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ structured: True + dispatch: + CPU: softmax_backward_cpu_out + CUDA: softmax_backward_cuda_out + MPS: softmax_backward_mps_out + +- func: unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsafe_split + autogen: unsafe_split.Tensor_out + +- func: split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: split + +- func: split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_guard: False + dispatch: + CompositeImplicitAutograd: split_symint + +- func: unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsafe_split_with_sizes + autogen: unsafe_split_with_sizes.out + +- func: split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: split_with_sizes + NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested + tags: core + +- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: squeeze(Tensor(a) self) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_nested + +- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested + tags: core + +- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + + +- func: squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested + tags: core + +- func: squeeze_(Tensor(a!) self) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!) 
+ variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + +- func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _sspaddmm_out_only_sparse + CUDA: _sspaddmm_out_only_sparse_cuda + SparseCPU: _sspaddmm_out_cpu + SparseCUDA: _sspaddmm_out_cuda + +- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor + dispatch: + CompositeExplicitAutograd: _chunk_cat + CUDA: _chunk_cat_cuda + +- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: _chunk_cat_out + CUDA: _chunk_cat_out_cuda + +- func: stack(Tensor[] tensors, int dim=0) -> Tensor + dispatch: + CompositeExplicitAutograd: stack + +- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: stack_out + +- func: _stack(Tensor[] tensors, int dim=0) -> Tensor + dispatch: # match the backends supported by _cat + CPU: _stack_cpu + CompositeExplicitAutograd: _stack + +- func: _stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: # match the backends supported by _cat_out + CPU: _stack_out_cpu + CompositeExplicitAutograd: _stack_out + +- func: hstack(Tensor[] tensors) -> Tensor + +- func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: vstack(Tensor[] tensors) -> Tensor + +- func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: dstack(Tensor[] tensors) -> Tensor + +- func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +# Overload without center & pad mode, needed for forward-compatibility +- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method + cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized'] + +- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method + +- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor + variants: function, method + +- func: stride.int(Tensor self, int dim) -> int + variants: function + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: stride.Dimname(Tensor self, Dimname dim) -> int + variants: function, method + device_check: NoCheck + device_guard: False + +- func: sym_stride.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sum(Tensor self, *, ScalarType? 
dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: sum + SparseCPU, SparseCUDA, SparseMeta: sum_coo + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr + autogen: sum.out + +- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype + structured_delegate: sum.IntList_out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + NestedTensorCPU: NestedTensor_sum_dim_CPU + SparseCPU, SparseCUDA: sum_sparse_coo + SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed + tags: core + +- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: sum_out + MPS: sum_out_mps + +- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +# TODO: this function will be replaced once nested expand semantics have been settled on +- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor + dispatch: + NestedTensorCPU: _nested_sum_backward_cpu + +- func: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: nansum + MPS: nansum_mps + +- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out + MPS: nansum_out_mps + +- func: sum_to_size(Tensor self, SymInt[] size) -> Tensor + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: sum_to_size_symint + +- func: sqrt(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sqrt.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sqrt_sparse + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr + tags: [core, pointwise] + +- func: sqrt_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sqrt.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_ + tags: pointwise + +- func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sqrt_out + MPS: sqrt_out_mps + SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out + tags: pointwise + +- func: square(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: square_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: std(Tensor self, bool unbiased=True) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.dim(Tensor self, int[1]? 
dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: std + MPS: std_mps + QuantizedCPU: std_quantized_cpu + +- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: std_mean + MPS: std_mean_mps + autogen: std_mean.correction_out + +- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + +- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: std_out + QuantizedCPU: std_out_quantized_cpu + +- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + +- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: prod + MPS: prod_mps + autogen: prod.out + tags: core + +- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: prod.int_out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: prod_out + MPS: prod_out_mps + +- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: t(Tensor(a) self) -> Tensor(a) + device_check: NoCheck + device_guard: False + variants: function, method + dispatch: + CompositeExplicitAutograd: t + +- func: t_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck + device_guard: False + variants: method + tags: inplace_view + dispatch: + CompositeExplicitAutograd: t_ + +- func: tan(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: tan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: tan_sparse + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr + tags: [core, pointwise] + +- func: tan_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: tan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: tan_sparse_ + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_ + tags: pointwise + +- func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tan_out + MPS: tan_out_mps + SparseCPU, SparseCUDA: tan_sparse_out + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out + tags: pointwise + +- func: tanh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: tanh.out + variants: function, method + dispatch: + QuantizedCPU: tanh_quantized_cpu + MkldnnCPU: mkldnn_tanh + SparseCPU, SparseCUDA: tanh_sparse + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh + tags: [core, pointwise] + +- func: tanh_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: tanh.out + variants: function, method + dispatch: + MkldnnCPU: mkldnn_tanh_ + SparseCPU, SparseCUDA: tanh_sparse_ + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_ + tags: pointwise + +- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tanh_out + MPS: tanh_out_mps + SparseCPU, SparseCUDA: tanh_sparse_out + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out + tags: pointwise + +- func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor + variants: function + +- func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +# TODO: namespace threshold in 'nn' +- func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + structured_delegate: threshold.out + dispatch: + QuantizedCPU: threshold_quantized_cpu + +- func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + structured_delegate: threshold.out + +- func: threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: threshold_out + MPS: threshold_out_mps + +- func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: threshold_backward_out + MPS: threshold_backward_out_mps + SparseCPU, SparseCUDA: threshold_backward_sparse_out + SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed_out + +- func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor + variants: function + structured_delegate: threshold_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_relu_backward + SparseCPU, SparseCUDA: threshold_backward_sparse + SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed + NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested + tags: pointwise + +- func: tile(Tensor self, SymInt[] dims) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: tile_symint + +- func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: transpose + NestedTensorCPU, NestedTensorCUDA: transpose_nested + +- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_transpose + +- func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: transpose_ + +- func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_transpose_ + autogen: _mkldnn_transpose.out + +- func: one_hot(Tensor self, int num_classes=-1) -> Tensor + python_module: nn + variants: function + tags: dynamic_output_shape + +- func: flip(Tensor self, int[] dims) -> Tensor + variants: function, method + dispatch: + CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip + MPS: flip_mps + autogen: flip.out + tags: core + +- func: fliplr(Tensor self) -> Tensor + variants: function, method + +- func: flipud(Tensor self) -> Tensor + variants: function, method + +- func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor + variants: function, method + dispatch: + CPU, MPS: roll + CUDA: roll_cuda + autogen: roll.out + +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args + +- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: rot90 + autogen: rot90.out + +- func: trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + +- func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor + +# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads). 
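+# Worked reading of the comment above (an illustration, not a spec of the kernel):
+# with embedding dim D, the packed projection has width 3*D and head_dim = D / num_heads,
+# so roughly
+#   q = (qkv[..., 0:D]     + qkv_bias[0:D])     / sqrt(D / num_heads)
+#   k =  qkv[..., D:2*D]   + qkv_bias[D:2*D]
+#   v =  qkv[..., 2*D:3*D] + qkv_bias[2*D:3*D]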
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu + CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda + autogen: _transform_bias_rescale_qkv.out + +- func: _nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask + autogen: _nested_tensor_from_mask.out + +- func: _nested_tensor_from_mask_left_aligned(Tensor t, Tensor mask) -> bool + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask_left_aligned + +- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor + device_check: NoCheck # cpu_nested_shape_example will always be on CPU + dispatch: + CPU: nested_from_padded_generic + CUDA: nested_from_padded_cuda + autogen: _nested_from_padded.out + +# These private functions are temporary. They will be updated/deleted when nested tensors switch to using SymInts for their metadata representation +- func: _nested_tensor_size(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size + autogen: _nested_tensor_size.out + +- func: _nested_tensor_strides(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides + autogen: _nested_tensor_strides.out + +- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets + autogen: _nested_tensor_storage_offsets.out + +# _nested_from_padded is not usable from Python, so +# _nested_from_padded_and_nested_example is available for testing. +- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example + autogen: _nested_from_padded_and_nested_example.out + +# The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation +# this will need to be updated +- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_view_from_buffer + +- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy + autogen: _nested_view_from_buffer_copy.out + +- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? 
lengths=None, int ragged_idx=1) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy + autogen: _nested_view_from_jagged_copy.out + +- func: _nested_get_values(Tensor(a) self) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_values_copy(Tensor self) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_get_values_copy + autogen: _nested_get_values_copy.out + +- func: _nested_get_offsets(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +# returns undefined Tensor if no lengths present +- func: _nested_get_lengths(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_ragged_idx(Tensor self) -> int + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_jagged_dummy(Tensor any) -> Tensor + category_override: dummy + dispatch: {} + +- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_compute_contiguous_strides_offsets + +- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor + dispatch: + # calls unsqueeze + CompositeExplicitAutogradNonFunctional: _trilinear + autogen: _trilinear.out + +- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor + +- func: trunc(Tensor self) -> Tensor + structured_delegate: trunc.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: trunc_sparse + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr + tags: [core, pointwise] + +- func: trunc_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: trunc.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_ + tags: pointwise + +- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: trunc_out + MPS: trunc_out_mps + SparseCPU, SparseCUDA: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out + tags: pointwise +# Alias for trunc + +- func: fix(Tensor self) -> Tensor + variants: function, method + +- func: fix_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: type_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool + variants: function + +- func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: _unique_cpu + CUDA: _unique_cuda + autogen: _unique.out + +- func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_dim_cpu + CUDA: unique_dim_cuda + tags: dynamic_output_shape + autogen: unique_dim.out + +- func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_consecutive_cpu + CUDA: unique_consecutive_cuda + MPS: unique_consecutive_mps + tags: dynamic_output_shape + autogen: unique_consecutive.out + +- func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_dim_consecutive_cpu + CUDA: unique_dim_consecutive_cuda + MPS: unique_dim_consecutive_mps + tags: dynamic_output_shape + autogen: unique_dim_consecutive.out + +# _unique and _unique_dim are fragile and modifying them easily cause internal break +# the below operator is a temporary hack for adding return_counts support +# Please don't rely on these two operators, they will be removed soon + +- func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: _unique2_cpu + CUDA: _unique2_cuda + MPS: _unique2_mps + tags: dynamic_output_shape + autogen: _unique2.out + +- func: _unsafe_view(Tensor self, SymInt[] size) -> Tensor + dispatch: + CompositeExplicitAutograd: _unsafe_view + autogen: _unsafe_view.out + +- func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsqueeze + SparseCPU, SparseCUDA: unsqueeze_sparse + QuantizedCPU, QuantizedCUDA: unsqueeze_quantized + NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested + tags: core + +- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: unsqueeze_ + +- func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor + +- func: var(Tensor self, bool unbiased=True) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + cpp_no_default_args: ["unbiased"] + +- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: var + MPS: var_mps + tags: core + +- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: var_out + +- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + +- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: var_mean + MPS: var_mean_mps + autogen: var_mean.correction_out + +- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + +- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA, MPS: where + tags: [core, pointwise] + +- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA, MPS: where_self_out + +- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor + variants: function + +- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor + variants: function + +- func: where(Tensor condition) -> Tensor[] + device_check: NoCheck # TensorIterator + variants: function + +- func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. 
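+# Illustrative note (standard weight-normalization math, not a spec of these kernels):
+# the reparameterization computes w = g * v / ||v||, with the norm taken over every
+# dimension except `dim`; e.g. for dim=0,
+#   w[i, ...] = g[i] * v[i, ...] / norm_except_dim(v, 2, 0)[i]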
+- func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor + variants: function + +- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: weight_norm_cpu + CUDA: weight_norm_cuda + MPS: weight_norm_mps + autogen: _weight_norm_interface.out + +- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: weight_norm_backward_cpu + CUDA: weight_norm_backward_cuda + MPS: weight_norm_backward_mps + autogen: _weight_norm_interface_backward.out + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) + variants: function + +- func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: zeros + autogen: zeros.names_out + +- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: _efficientzerotensor + CUDA: _efficientzerotensor_cuda + MPS: _efficientzerotensor_mps + Meta: _efficientzerotensor_meta_symint + autogen: _efficientzerotensor.out + +- func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: zeros_symint + +- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: zeros_out + SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out + +- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like + autogen: zeros_like.out + +- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor + variants: function + dispatch: + CPU: _standard_gamma_grad_cpu + CUDA: _standard_gamma_grad_cuda + autogen: _standard_gamma_grad.out + +- func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor + variants: function + dispatch: + CPU: _s_gamma_cpu + CUDA: _s_gamma_cuda + tags: nondeterministic_seeded + autogen: _standard_gamma.out + +- func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor + dispatch: + CPU: _dirichlet_grad_cpu + CUDA: _dirichlet_grad_cuda + autogen: _dirichlet_grad.out + +- func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor + tags: nondeterministic_seeded + variants: function + dispatch: + CPU: _s_dirichlet_cpu + CUDA: _s_dirichlet_cuda + autogen: _sample_dirichlet.out + +- func: poisson(Tensor self, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CPU: _s_poisson_cpu + CUDA: _s_poisson_cuda + tags: nondeterministic_seeded + autogen: poisson.out + +- func: binomial(Tensor count, Tensor prob, Generator? 
generator=None) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CPU: _s_binomial_cpu + CUDA: _s_binomial_cuda + tags: nondeterministic_seeded + autogen: binomial.out + +# When more variants get ported to native, this dispatch will get more +# complicated + +- func: native_norm(Tensor self, Scalar p=2) -> Tensor + dispatch: + SparseCPU, SparseCUDA: norm_sparse + autogen: native_norm.out + +- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor + dispatch: + SparseCPU, SparseCUDA: norm_sparse + autogen: native_norm.ScalarOpt_dim_dtype_out + +- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_with_update_cpu + CUDA: _batch_norm_with_update_cuda + MPS: _batch_norm_with_update_mps + MkldnnCPU: _batch_norm_with_update_mkldnn + autogen: _batch_norm_with_update_functional + +- func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) + dispatch: + CPU: _batch_norm_with_update_cpu_out + CUDA: _batch_norm_with_update_cuda_out + MPS: _batch_norm_with_update_mps_out + +- func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_no_update + autogen: _batch_norm_no_update.out + +- func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _new_batch_norm_backward_cpu + CUDA: _new_batch_norm_backward_cuda + MPS: _new_batch_norm_backward_mps + MkldnnCPU: _new_batch_norm_backward_mkldnn + +# TODO: reduce signatures down to one when optional args is available +- func: _sparse_sum(Tensor self) -> Tensor + +- func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + +- func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor + dispatch: + CompositeExplicitAutograd: _sparse_sum + autogen: _sparse_sum.dim_out + +- func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + +- func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor + dispatch: + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda + autogen: _sparse_sum_backward.out + +- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_sum_cpu + SparseCsrCUDA: _sparse_csr_sum_cuda + autogen: _sparse_csr_sum.dim_dtype_out + +- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_prod_cpu + SparseCsrCUDA: _sparse_csr_prod_cuda + autogen: _sparse_csr_prod.dim_dtype_out + +- func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + python_module: sparse + dispatch: + SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda + autogen: _sparse_softmax.out + +- func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + dispatch: + SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda + autogen: _sparse_softmax_backward_data.out + +- func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + python_module: sparse + dispatch: + SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda + autogen: _sparse_log_softmax.out + +- func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + dispatch: + SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda + autogen: _sparse_log_softmax_backward_data.out + +- func: _spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor + python_module: sparse + dispatch: + CPU: spdiags + autogen: _spdiags.out + +- func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: norm + autogen: norm.ScalarOpt_dtype_out + +- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: norm + autogen: norm.Scalar_out + +- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + structured_delegate: norm.dtype_out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sparse_dtype_norm + +- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor + structured_delegate: norm.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sparse_norm + +- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: norm_dtype_out + MPS: norm_dtype_out_mps + +- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: norm_out + MPS: norm_out_mps + +# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd +- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + +- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) + variants: method, function + dispatch: + CompositeExplicitAutograd: frexp + tags: pointwise + +- func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) + dispatch: + CPU, CUDA: frexp_out + tags: pointwise + +# Deprecated (v.1.12) +- func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +- func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: clone + SparseCPU, SparseCUDA: clone_sparse + SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed + MkldnnCPU: mkldnn_clone + QuantizedCPU, QuantizedCUDA: quantized_clone + NestedTensorCPU, NestedTensorCUDA: clone_nested + autogen: clone.out + tags: [core, pointwise] + +- func: positive(Tensor(a) self) -> Tensor(a) + variants: function, method + tags: pointwise + +- func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + dispatch: + CompositeExplicitAutograd: resize_as_ + autogen: resize_as, resize_as.out + tags: inplace_view + +- func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + dispatch: + SparseCPU, SparseCUDA: resize_as_sparse_ + SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_compressed_ + autogen: resize_as_sparse, resize_as_sparse.out + +- func: zero_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: zero_ + MPS: zero_mps_ + Meta: zero_meta_ + SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ + MkldnnCPU: mkldnn_zero_ + NestedTensorCPU, NestedTensorCUDA: zero_nested_ + autogen: zero, zero.out + +- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sub_out + MPS: sub_out_mps + SparseCPU, SparseCUDA: sub_out_sparse + tags: pointwise + +- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: sub.out + dispatch: + SparseCPU, SparseCUDA: sub_sparse + ZeroTensor: sub_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor + tags: [core, pointwise] + +- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: sub.out + dispatch: + SparseCPU, SparseCUDA: sub_sparse_ + tags: pointwise +# For C++ only, until we have conversion from C++ numbers to Tensor + +- func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: sub + tags: [core, pointwise] + +- func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: sub_ + autogen: sub.Scalar_out + tags: pointwise +# subtract, alias for sub + +- func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + +- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function, method + +- func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + variants: method + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function, method + +- func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + variants: method + +- func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: rsub + autogen: rsub.Tensor_out + +- func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: heaviside_out + tags: pointwise + +- func: heaviside(Tensor self, Tensor values) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: heaviside.out + tags: pointwise + +- func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: heaviside.out + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: rsub + autogen: rsub.Scalar_out + +# Functionally the same as addmm, but we give it a different derivative formula +# that doesn't propagate gradients to non-present entries on sparse. + tags: pointwise +- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + python_module: sparse + dispatch: + CompositeExplicitAutograd: _sparse_addmm + autogen: _sparse_addmm.out + +- func: sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+  python_module: sparse
+  dispatch:
+    SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda
+    SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu
+
+- func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  python_module: sparse
+  dispatch:
+    SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda
+    SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu
+
+- func: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_sparse_csr_cpu
+
+- func: _sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_backward_sparse_csr_cpu
+
+- func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: addmm_out_cpu
+    CUDA: addmm_out_cuda
+    MPS: addmm_out_mps
+    SparseCPU: addmm_out_sparse_dense_cpu
+    SparseCUDA: addmm_out_sparse_dense_cuda
+    SparseCsrCPU: addmm_out_sparse_compressed_cpu
+    SparseCsrCUDA: addmm_out_sparse_compressed_cuda
+
+- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  structured_delegate: addmm.out
+  variants: function, method
+  dispatch:
+    SparseCPU: addmm_sparse_dense_cpu
+    SparseCUDA: addmm_sparse_dense_cuda
+    SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense
+  tags: core
+
+- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  structured_delegate: addmm.out
+  variants: method
+  dispatch:
+    # Warning! For whatever reason, the inplace sparse addmm is NON
+    # broadcasting
+    SparseCPU: s_addmm_sparse_dense_cpu_
+    SparseCUDA: s_addmm_sparse_dense_cuda_
+
+- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: addmm_activation_out_cpu
+    CUDA: addmm_activation_out_cuda
+
+- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
+  structured_delegate: _addmm_activation.out
+  variants: function, method
+
+- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _scaled_mm_cuda
+
+- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
+  variants: function
+  dispatch:
+    CUDA: _scaled_mm_out_cuda
+
+# NOTE [ Sparse: autograd and API ]
+#
+#
+# Sparse Tensor Constructors
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The API entry points to sparse tensor construction should be
+# `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the
+# indices and values tensors are given, they eventually dispatch to either
+# `sparse_coo_tensor_with_dims` or `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The autograd support for the ctor is implemented on `sparse_coo_tensor_with_dims_and_tensors`.
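+#
+# E.g. (an illustrative sketch only, via the Python frontend; the indices and
+# values below are made up for the example):
+#
+#   import torch
+#   i = torch.tensor([[0, 1], [1, 0]])               # 2 sparse dims, 2 nnz
+#   v = torch.tensor([3., 4.], requires_grad=True)
+#   # indices and values are given, so this eventually reaches
+#   # `sparse_coo_tensor_with_dims_and_tensors`
+#   s = torch.sparse_coo_tensor(i, v, (2, 2))
+#   s.coalesce().values().sum().backward()           # grad flows back to `v`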
+#
+# The API methods `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`
+# **must not** have specific type dispatches because otherwise codegen will
+# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch
+# using **Tensor** type, and thus lose autograd tracking on the actual method
+# they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`.
+#
+#
+# Sparse Methods API Design
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Goals: 1. Flexible API for users to write custom sparse ops
+#        2. ctor and member accessors with autograd support
+#
+# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the
+# sense that misusing them will break sparse tensor invariants and may result in
+# unexpected behavior, e.g., crash). These methods are all prefixed with
+# underscore "_" to indicate that they should be used with care. We provide:
+#
+# + `_indices()`: returns the *raw* indices within the sparse tensor (not just
+#                 sharing storage). Any inplace operation will change the
+#                 actual indices, including t_, set_, as_strided_, resize_,
+#                 etc.
+# + `_values()`: returns the *raw* values within the sparse tensor. Similar
+#                semantics as `_indices()`.
+# + `_nnz()`: returns the number of non-zero entries. This will always be
+#             determined by the shapes of indices and values.
+# + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and
+#                        returns itself.
+#
+# These methods are very useful in writing new operations, e.g., a custom
+# autograd Function.
+#
+# We also provide other public *safe* APIs:
+# + `indices()`: returns a **view** of the indices tensor if the sparse tensor
+#                is **coalesced**.
+# + `values()`: returns a **view** of the values tensor if the containing
+#               sparse tensor is **coalesced**.
+# + `sparse_dim()`: number of sparse dimensions
+# + `dense_dim()`: number of dense dimensions
+# + `is_coalesced()`: whether the sparse tensor is coalesced
+#
+# `_indices()` and `_values()` should return the raw indices and values dense
+# tensors within a sparse tensor. They can be quite unsafe with inplace
+# operations like `t_()`, and expose uncoalesced indices and values. The public
+# recommended API is `indices()` and `values()`, both of which first check that
+# the tensor is coalesced and return views on those tensors.
+#
+#
+# Autograd Support
+# ~~~~~~~~~~~~~~~~
+#
+# Autograd is supported on `values()` and on the sparse tensor ctor with indices
+# and values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is
+# differentiable w.r.t. `v`.
+#
+# NB: The `values()` and `_values()` operators are special in that they are
+# layout-aware, i.e., the output depends not just on the data it represents, but
+# also on the input layout details (in this case, the `indices` tensor). See
+# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp
+# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops
+# operate in the layout-agnostic mode, similar to `as_strided`, the backward of
+# these two operators needs to consider them in a layout-agnostic way:
+# + `values()`:
+#     Input is coalesced.
+#     We just pretend having `input.indices()` as an additional argument
+#     `input_indices`, then forward is similar to
+#     `input.to(kStrided).index_select(input_indices)` regardless of the layout.
+#     Note that `values()` normally is layout-aware even if we constrain
+#     ourselves on sparse inputs since it may include all zeros values entries
+#     as "present" entries.
+# + `_values()`:
+#     Input may be uncoalesced.
+# It is not straightforward to construct a layout-agnostic version because +# duplicate indices entries may exist and additional parameterization is +# needed to distribute the value into different values entries. Furthermore, +# this op is intended to provide ways to write custom sparse ops, rather +# than being used in autograd graph, so it is marked as *non-differentiable* +# in derivatives.yaml. +# +# Before reading the following, see NOTE [ Autograd Variable Views ] in +# variable.h for details on views that are tracked by autograd, and views that +# are not. +# +# Moreover, these methods return tensors that share storage with inputs, so we +# mark these methods as view ops to support autograd history tracking. +# The sparse tensor ctor output should technically be view of both input indices +# and values tensors, but currently we only support setting as view of a single +# Variable, so it is only view of the values tensor. +# TODO: clone indices in sparse tensor ctor. +# +# For other methods that return outputs that share storage with inputs, i.e., +# `indices()` and `_indices()`. We mark their outputs as non-differentiable, so +# the view relation is not tracked by autograd, but the version counter is still +# shared. In other words, their outputs are non-differentiable views of the +# sparse tensor. +# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given +# the default would never make sense. + +- func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor_with_dims + +- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor + +- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + +- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor +- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + +- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint + +- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + +- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_coo_tensor + autogen: sparse_coo_tensor.size_out + +- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor + +- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor + +- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor + dispatch: + CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint + +- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> () + +- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () +- func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () + +- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? 
layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse + autogen: _sparse_coo_tensor_with_dims.out + +- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor + dispatch: + SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint + autogen: _sparse_coo_tensor_with_dims_and_tensors.out + +- func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: sparse_resize_ + autogen: sparse_resize, sparse_resize.out + +- func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: sparse_resize_and_clear_ + autogen: sparse_resize_and_clear, sparse_resize_and_clear.out + +- func: sparse_mask(Tensor self, Tensor mask) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_mask + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed + autogen: sparse_mask.out + +- func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_mask_projection + autogen: _sparse_mask_projection.out + +- func: _to_cpu(Tensor[] tensors) -> Tensor[] + variants: function + +- func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor + variants: method + +# Special case of to_dense with custom derivative +- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense + MkldnnCPU: mkldnn_to_dense + autogen: _to_dense.out + +- func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor + +- func: sparse_dim(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr + CompositeExplicitAutograd: sparse_dim_default + device_check: NoCheck + device_guard: False + +# legacy method +- func: _dimI(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_dim_sparse + device_check: NoCheck + device_guard: False + +- func: dense_dim(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + CompositeExplicitAutograd: dense_dim_default + device_check: NoCheck + device_guard: False + +# legacy method +- func: _dimV(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse + device_check: NoCheck + device_guard: False + +- func: _nnz(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr + device_check: NoCheck + device_guard: False + +# NOTE: [ coalesce autograd ] +# coalesce returns self directly for already coalesced sparse tensors. 
+# This means coalesce cannot have a derivative registered, otherwise it creates +# circular references in the autograd graph (see gh-52874). +# Instead, the derivative is registered on the slow-path "_coalesce" +- func: coalesce(Tensor(a) self) -> Tensor(a) + variants: method + +- func: _coalesce(Tensor self) -> Tensor + dispatch: + SparseCPU: _coalesce_sparse_cpu + SparseCUDA: _coalesce_sparse_cuda + autogen: _coalesce.out + +- func: is_coalesced(Tensor self) -> bool + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: is_coalesced_sparse + CompositeExplicitAutograd: is_coalesced_default + device_check: NoCheck + device_guard: False + +- func: _indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _indices_sparse + device_check: NoCheck + device_guard: False + +- func: _values(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _values_sparse + device_check: NoCheck + device_guard: False + +# This method doesn't do any check but only directly sets the flag. So it can be +# a bit unsafe. Similar to _indices and _values, this is useful for implementing +# custom sparse operations in Python/C++ extension. +- func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _coalesced_sparse_ + device_check: NoCheck + device_guard: False + autogen: _coalesced, _coalesced.out + +- func: indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: indices_sparse + CompositeExplicitAutograd: indices_default + device_check: NoCheck + device_guard: False + +- func: values(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: values_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr + NestedTensorCPU, NestedTensorCUDA: values_nested + CompositeExplicitAutograd: values_default + device_check: NoCheck + device_guard: False + +- func: crow_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr + CompositeExplicitAutograd: crow_indices_default + device_check: NoCheck + device_guard: False + +- func: col_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr + CompositeExplicitAutograd: col_indices_default + device_check: NoCheck + device_guard: False + +- func: ccol_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr + CompositeExplicitAutograd: ccol_indices_default + device_check: NoCheck + device_guard: False + +- func: row_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr + CompositeExplicitAutograd: row_indices_default + device_check: NoCheck + device_guard: False + +- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + SparseCPU: hspmm_out_sparse_cpu + SparseCUDA: hspmm_out_sparse_cuda + +- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor + dispatch: + SparseCPU: hspmm_sparse_cpu + SparseCUDA: hspmm_sparse_cuda + +- func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
+ device_check: NoCheck # Allows copy into different device + variants: function + dispatch: + SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ + autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out + +# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors +- func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeExplicitAutograd: unbind + NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind + +- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] + variants: function, method + +- func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor + variants: method + +# Special case of to_sparse.sparse_dim with custom derivative +- func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse + SparseCPU, SparseCUDA: sparse_coo_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + autogen: _to_sparse.sparse_dim_out + +- func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor + variants: method + +# Special case of to_sparse with custom derivative +- func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse + SparseCPU, SparseCUDA: sparse_coo_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + autogen: _to_sparse.out + +- func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor + variants: method + +# Special case of to_sparse_csr with custom derivative +- func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csr + SparseCPU, SparseCUDA: coo_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + autogen: _to_sparse_csr.out + +- func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor + variants: method + +# Special case of to_sparse_csc with custom derivative +- func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csc + SparseCPU, SparseCUDA: coo_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + autogen: _to_sparse_csc.out + +- func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + +# Special case of to_sparse_bsr with custom derivative +- func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsr + SparseCPU, SparseCUDA: coo_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + autogen: _to_sparse_bsr.out + +- func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + +# Special case of to_sparse_bsc with custom derivative +- func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsc + SparseCPU, SparseCUDA: coo_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc + autogen: _to_sparse_bsc.out + +- func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _to_sparse_semi_structured + +- func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor + variants: method + dispatch: + CPU: dense_to_mkldnn + autogen: to_mkldnn.out + +- func: mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv2d_weight + autogen: mkldnn_reorder_conv2d_weight.out + +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv3d_weight + autogen: mkldnn_reorder_conv3d_weight.out + +- func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor + +- func: quantize_per_tensor_dynamic(Tensor self, ScalarType dtype, bool reduce_range) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor_dynamic + autogen: quantize_per_tensor_dynamic.out + +- func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor + autogen: quantize_per_tensor.out + +- func: quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor_tensor_qparams + autogen: quantize_per_tensor.tensor_qparams_out + +- func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + variants: function + dispatch: + CPU: quantize_per_tensor_list_cpu + autogen: quantize_per_tensor.tensors_out + +- func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_channel + autogen: quantize_per_channel.out + +- func: dequantize.self(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: dequantize_cpu_or_cuda + QuantizedCPU, QuantizedCUDA: dequantize_quantized + autogen: dequantize.self_out + +- func: dequantize.tensors(Tensor[] tensors) -> Tensor[] + variants: function + dispatch: + QuantizedCPU: dequantize_tensors_quantized_cpu + autogen: dequantize.tensors_out + +- func: q_scale(Tensor self) -> float + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_scale_quant + +- func: q_zero_point(Tensor self) -> int + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_zero_point_quant + +- func: q_per_channel_scales(Tensor self) -> Tensor + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_scales + autogen: q_per_channel_scales.out + +- func: q_per_channel_zero_points(Tensor self) -> Tensor + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points + autogen: q_per_channel_zero_points.out + +- func: q_per_channel_axis(Tensor self) -> int + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_axis + +- func: int_repr(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + QuantizedCPU: int_repr_quantized_cpu + QuantizedCUDA: int_repr_quantized_cuda + autogen: int_repr.out + +- func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor + dispatch: + CPU: make_per_tensor_quantized_tensor_cpu + CUDA: make_per_tensor_quantized_tensor_cuda + autogen: 
_make_per_tensor_quantized_tensor.out + +- func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor + dispatch: + CPU: make_per_channel_quantized_tensor_cpu + CUDA: make_per_channel_quantized_tensor_cuda + autogen: _make_per_channel_quantized_tensor.out + +- func: qscheme(Tensor self) -> QScheme + variants: method + dispatch: + QuantizedCPU, QuantizedCUDA: qscheme_quant + +- func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine_cachemask + autogen: fake_quantize_per_tensor_affine_cachemask.out + +- func: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams + autogen: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out + +- func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + +- func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine + autogen: _fake_quantize_learnable_per_tensor_affine.out + +- func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine_backward + +- func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine_cachemask + autogen: fake_quantize_per_channel_affine_cachemask.out + +- func: fake_quantize_per_channel_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + +- func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine + autogen: _fake_quantize_learnable_per_channel_affine.out + +- func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine_backward + +- func: 
fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor + variants: function + +- func: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) + dispatch: + CPU: fused_moving_avg_obs_fake_quant_cpu + CUDA: fused_moving_avg_obs_fake_quant_cuda + autogen: _fused_moving_avg_obs_fq_helper_functional, _fused_moving_avg_obs_fq_helper.out + +- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) + variants: function + +- func: _saturate_weight_to_fp16(Tensor weight) -> Tensor + variants: function + +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) + variants: function + +- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a) + variants: method + device_guard: False + +- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a) + variants: method + device_guard: False + +- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: _to_copy + NestedTensorCPU, NestedTensorCUDA: _to_copy_nested + autogen: _to_copy.out + tags: core + +# to(Device) must not exist because all constructors of Device also works for +# TensorOptions. Otherwise, an ambiguity error is thrown. +# See NOTE [ TensorOptions Constructors ]. +- func: to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: meshgrid(Tensor[] tensors) -> Tensor[] + +# TODO: Two weeks after this lands, combine these two overloads, +# making "indexing" optional. These are temporarily distinct for +# forward-compatibility reasons. 
+- func: meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[] + +- func: cartesian_prod(Tensor[] tensors) -> Tensor + variants: function + +- func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor + variants: function + +- func: item(Tensor self) -> Scalar + tags: data_dependent_output + variants: method + +- func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + variants: function + +- func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + variants: function + +- func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + variants: function + +- func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + +- func: can_cast(ScalarType from_, ScalarType to) -> bool + variants: function + +- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + variants: function + +# NB: Does NOT check precondition that numel == 1 +- func: _local_scalar_dense(Tensor self) -> Scalar + tags: [core, data_dependent_output] + dispatch: + CPU: _local_scalar_dense_cpu + CUDA: _local_scalar_dense_cuda + MPS: _local_scalar_dense_mps + variants: function + +# MPS LSTM implementation + +- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + MPS: _lstm_mps + autogen: _lstm_mps.out + tags: nondeterministic_seeded + +- func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + dispatch: + MPS: lstm_mps_backward + autogen: lstm_mps_backward.out + + +# Fused RNN kernels +- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_lstm_cell_cuda + autogen: _thnn_fused_lstm_cell.out + +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_lstm_cell_backward_impl_cuda + autogen: _thnn_fused_lstm_cell_backward_impl.out + +- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +- func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? 
hidden_bias=None) -> (Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_gru_cell_cuda + autogen: _thnn_fused_gru_cell.out + +- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_gru_cell_backward_cuda + autogen: _thnn_fused_gru_cell_backward.out + +- func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +# RNN cells and layers +- func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded + +- func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded + +- func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + +- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +- func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +- func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +# Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp` + +# Quantized RNN layers +# - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) + + +# - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? 
dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) + + +# Quantized GRU layers + +# - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) +# + +# - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) +# + +# Quantized RNN cells +- func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) + +- func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +- func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +- func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +# PackedSequence utilities +- func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _pack_padded_sequence + autogen: _pack_padded_sequence.out + +- func: _pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor + dispatch: + CompositeImplicitAutograd: _pack_padded_sequence_backward_symint + +- func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) + +# wrappers for legacy TH methods + +- func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: set_ + autogen: set.source_Storage, set.source_Storage_out + tags: inplace_view + +- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU: set_storage_cpu_ + Meta: set_storage_meta__symint + CUDA: set_storage_cuda_ + MPS: set_storage_mps_ + QuantizedCPU, QuantizedCUDA: set_storage_quantized_ + autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out + tags: inplace_view + +- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: set__symint + tags: inplace_view + +- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) 
+ variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: set_tensor_ + autogen: set.source_Tensor, set.source_Tensor_out + tags: inplace_view + +- func: set_(Tensor(a!) self) -> Tensor(a!) + variants: method + dispatch: + CPU: set_cpu_ + CUDA: set_cuda_ + Meta: set_meta_ + MPS: set_mps_ + autogen: set, set.out + tags: inplace_view + +# Not making it CompositeImplicitAutograd because lift +# should be a primitive w.r.t. functorch + +# TODO: this should have a view annotation +# TODO: shouldn't be a method +- func: lift(Tensor self) -> Tensor + dispatch: + CompositeExplicitAutograd: lift + autogen: lift.out + +# lift_fresh is called with an argument that is guaranteed to be +# fresh (i.e., newly allocated). This is ONLY called from a +# torch.tensor call; if you FX trace a lift_fresh, you are obligated +# to convert this into a lift_fresh_copy (because FX will violate the +# freshness invariant when tracing). +- func: lift_fresh(Tensor(a) self) -> Tensor(a) + dispatch: + CompositeExplicitAutograd: lift_fresh + +# Like lift, but it clones the input. +- func: lift_fresh_copy(Tensor self) -> Tensor + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: lift_fresh_copy + autogen: lift_fresh_copy.out + +- func: is_set_to(Tensor self, Tensor tensor) -> bool + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, MPS: is_set_to + +- func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: masked_fill__cpu + CUDA: masked_fill__cuda + QuantizedCPU: masked_fill__quantized_cpu + QuantizedCUDA: masked_fill__quantized_cuda + MPS: masked_fill__mps + autogen: masked_fill.Scalar_out + +- func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_fill + NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill + tags: pointwise + +- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: masked_fill__cpu + CUDA: masked_fill__cuda + QuantizedCPU: masked_fill__quantized_cpu + QuantizedCUDA: masked_fill__quantized_cuda + MPS: masked_fill__mps + autogen: masked_fill.Tensor_out + +- func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_fill + +- func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) + variants: method + dispatch: + CPU: masked_scatter__cpu + CUDA: masked_scatter__cuda + MPS: masked_scatter__mps + autogen: masked_scatter.out + +- func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_scatter + +- func: masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor + dispatch: + CompositeExplicitAutograd: masked_scatter_backward_symint + +- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor + dispatch: + CUDA: masked_softmax_cuda + CPU: masked_softmax_cpu + autogen: _masked_softmax.out + +- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? 
dim=None) -> Tensor + dispatch: + CUDA: masked_softmax_backward_cuda + CPU: masked_softmax_backward_cpu + autogen: _masked_softmax_backward.out + +- func: view(Tensor(a) self, SymInt[] size) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view + MkldnnCPU: mkldnn_view + NestedTensorCPU, NestedTensorCUDA: view_nested + tags: core + +# Warning: If you want to change the name or overload name of this +# operator, you might also want to change the `isBlockListedSchema` +# function in `torch/csrc/jit/frontend/schema_catching.cpp`. +# The name and overload name of this operator is hardcoded in that +# function in order to workaround a bug: +# https://github.com/pytorch/pytorch/issues/47964 +- func: view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: view_dtype + +- func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: put_ + autogen: put.out + +- func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: put + +- func: index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_add_cpu_out + CUDA: index_add_cuda_out + MPS: index_add_mps_out + +- func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!) + structured_delegate: index_add.out + variants: method + +- func: index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor + structured_delegate: index_add.out + variants: function, method + +- func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor + variants: function, method + +- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_reduce_cpu_out + CUDA: index_reduce_cuda_out + +- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: index_reduce.out + variants: method + +- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: index_reduce.out + variants: function, method + +- func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: index_fill_ + CUDA: index_fill_ + MPS: index_fill_mps_ + autogen: index_fill.int_Scalar_out + +- func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_fill + +- func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: index_fill_ + MPS: index_fill_mps_ + autogen: index_fill.int_Tensor_out + +- func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_fill + +- func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter.src_out + variants: function, method + tags: core + +- func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + structured_delegate: scatter.src_out + variants: method + +- func: scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_src_out + MPS: scatter_src_out_mps + +- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor + structured_delegate: scatter.value_out + variants: function, method + tags: core + +- func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + structured_delegate: scatter.value_out + variants: method + +- func: scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_value_out + MPS: scatter_value_out_mps + +- func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor + structured_delegate: scatter.reduce_out + variants: function, method + +- func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) + structured_delegate: scatter.reduce_out + variants: method + +- func: scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_reduce_out + MPS: scatter_reduce_out_mps + +- func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor + structured_delegate: scatter.value_reduce_out + variants: function, method + +- func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) + structured_delegate: scatter.value_reduce_out + variants: method + +- func: scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + variants: function + dispatch: + CPU, CUDA: scatter_value_reduce_out + MPS: scatter_value_reduce_out_mps + +- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor + variants: function, method + +- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor + variants: function, method + +- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter_add.out + variants: function, method + tags: core + +- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + structured_delegate: scatter_add.out + variants: method + +- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_add + MPS: scatter_add_mps_out + +- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor + variants: function, method + +- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: scatter_reduce.two_out + variants: function, method + tags: core + +- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: scatter_reduce.two_out + variants: method + +- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_reduce_two + +- func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: eq.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: eq.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_and_out + MPS: bitwise_and_out_mps + tags: pointwise + +- func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and_out + tags: pointwise + +- func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_and + tags: [core, pointwise] + +- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and + autogen: bitwise_and.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_and.Tensor_out + tags: [core, pointwise] + +- func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_and_ + tags: pointwise + +- func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_and.Tensor_out + tags: pointwise + +- func: __and__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __and__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_or_out + MPS: bitwise_or_out_mps + tags: pointwise + +- func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or_out + tags: pointwise + +- func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_or + tags: [core, pointwise] + +- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or + autogen: bitwise_or.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_or.Tensor_out + tags: [core, pointwise] + +- func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_or_ + tags: pointwise + +- func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_or.Tensor_out + tags: pointwise + +- func: __or__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __or__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_xor_out + MPS: bitwise_xor_out_mps + tags: pointwise + +- func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor_out + tags: pointwise + +- func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_xor + tags: [core, pointwise] + +- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor + autogen: bitwise_xor.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_xor.Tensor_out + tags: [core, pointwise] + +- func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_xor_ + tags: pointwise + +- func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_xor.Tensor_out + tags: pointwise + +- func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __lshift__ + tags: pointwise + +- func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __lshift__ + tags: pointwise + +- func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __ilshift__ + autogen: __lshift__.Scalar_out + tags: pointwise + +- func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __ilshift__ + autogen: __lshift__.Tensor_out + tags: pointwise + +- func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_left_shift.Tensor_out + tags: pointwise + +- func: bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_left_shift.Tensor_out + tags: pointwise + +- func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_left_shift_out + tags: pointwise + +- func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift + tags: pointwise + +- func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_left_shift_ + tags: pointwise + +- func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift_out + tags: pointwise + +- func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift + autogen: bitwise_left_shift.Scalar_Tensor_out + tags: pointwise + +- func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __rshift__ + tags: pointwise + +- func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __rshift__ + tags: pointwise + +- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __irshift__ + autogen: __rshift__.Scalar_out + +- func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __irshift__ + autogen: __rshift__.Tensor_out + +- func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_right_shift.Tensor_out + tags: pointwise + +- func: bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_right_shift.Tensor_out + tags: pointwise + +- func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_right_shift_out + tags: pointwise + +- func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift + tags: pointwise + +- func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_right_shift_ + tags: pointwise + +- func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift_out + tags: pointwise + +- func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift + autogen: bitwise_right_shift.Scalar_Tensor_out + tags: pointwise + +- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + structured_delegate: tril.out + variants: method + +- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + structured_delegate: triu.out + variants: method + +- func: digamma_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: digamma.out + variants: method + tags: pointwise + +- func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: lerp.Scalar_out + tags: pointwise + +- func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: lerp.Tensor_out + tags: pointwise + +- func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: addbmm_ + MPS: addbmm_mps_ + +- func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addbmm_out + MPS: addbmm_out_mps + +- func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: addbmm + MPS: addbmm_mps + +- func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: random_ + Meta: random_meta_ + MPS: random_mps_ + autogen: random.from, random.from_out + +- func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: random_ + Meta: random_meta_ + MPS: random_mps_ + autogen: random.to, random.to_out + +- func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: random_ + MPS: random_mps_ + Meta: random_meta_ + autogen: random, random.out + +- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: uniform_ + MPS: uniform_mps_ + Meta: uniform_meta_ + autogen: uniform, uniform.out + +- func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: cauchy_ + autogen: cauchy, cauchy.out + +- func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: log_normal_ + autogen: log_normal, log_normal.out + +- func: exponential_(Tensor(a!) 
self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: exponential_ + MPS: exponential_mps_ + autogen: exponential, exponential.out + +- func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: geometric_ + + # wrappers for TH functions + autogen: geometric, geometric.out + +- func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: diag(Tensor self, int diagonal=0) -> Tensor + variants: method, function + +- func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor + variants: method, function + +- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: triu_cpu + CUDA: triu_cuda + MPS: triu_mps_out + +- func: triu(Tensor self, int diagonal=0) -> Tensor + structured_delegate: triu.out + variants: method, function + +- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: tril_cpu + CUDA: tril_cuda + MPS: tril_mps_out + +- func: tril(Tensor self, int diagonal=0) -> Tensor + structured_delegate: tril.out + variants: method, function + +- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: tril_indices_cpu + CUDA: tril_indices_cuda + autogen: tril_indices.out + +- func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: triu_indices_cpu + CUDA: triu_indices_cuda + autogen: triu_indices.out + +- func: trace(Tensor self) -> Tensor + variants: method, function + dispatch: + CPU: trace_cpu + CUDA: trace_cuda + MPS: trace_mps + autogen: trace.out + +- func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: trace_backward_symint + +- func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ne_Scalar_out + MPS: ne_scalar_out_mps + QuantizedCPU: ne_out_quantized_cpu + tags: pointwise + +- func: ne.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ne.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ne_quantized_cpu + tags: [core, pointwise] + +- func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ne_Tensor_out + MPS: ne_tensor_out_mps + QuantizedCPU: ne_out_quantized_cpu + tags: pointwise + +- func: ne.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ne.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ne_quantized_cpu + tags: [core, pointwise] + +- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ structured_delegate: ne.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: ne.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# not_equal, alias for torch.ne +- func: not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: eq_Scalar_out + MPS: eq_scalar_out_mps + QuantizedCPU: eq_out_quantized_cpu + tags: pointwise + +- func: eq.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: eq.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: eq_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested + tags: [core, pointwise] + +- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: eq_Tensor_out + MPS: eq_tensor_out_mps + QuantizedCPU: eq_out_quantized_cpu + tags: pointwise + +- func: eq.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: eq.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: eq_quantized_cpu + tags: [core, pointwise] + +- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ge_Scalar_out + MPS: ge_scalar_out_mps + QuantizedCPU: ge_out_quantized_cpu + tags: pointwise + +- func: ge.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ge.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ge_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested + tags: [core, pointwise] + +- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ge_Tensor_out + MPS: ge_tensor_out_mps + QuantizedCPU: ge_out_quantized_cpu + tags: pointwise + +- func: ge.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ge.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ge_quantized_cpu + tags: [core, pointwise] + +- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: ge.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ structured_delegate: ge.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# greater_equal, alias for torch.ge +- func: greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: le_Scalar_out + MPS: le_scalar_out_mps + QuantizedCPU: le_out_quantized_cpu + tags: pointwise + +- func: le.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: le.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: le_quantized_cpu + tags: [core, pointwise] + +- func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: le_Tensor_out + MPS: le_tensor_out_mps + QuantizedCPU: le_out_quantized_cpu + tags: pointwise + +- func: le.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: le.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: le_quantized_cpu + tags: [core, pointwise] + +- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: le.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: le.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# less_equal, alias for torch.le +- func: less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: gt_Scalar_out + MPS: gt_scalar_out_mps + QuantizedCPU: gt_out_quantized_cpu + tags: pointwise + +- func: gt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: gt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: gt_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested + tags: [core, pointwise] + +- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: gt_Tensor_out + MPS: gt_tensor_out_mps + QuantizedCPU: gt_out_quantized_cpu + tags: pointwise + +- func: gt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: gt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: gt_quantized_cpu + tags: [core, pointwise] + +- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: gt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: gt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# greater, alias for torch.gt +- func: greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: lt_Scalar_out + MPS: lt_scalar_out_mps + QuantizedCPU: lt_out_quantized_cpu + tags: pointwise + +- func: lt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: lt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: lt_quantized_cpu + tags: [core, pointwise] + +- func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: lt_Tensor_out + MPS: lt_tensor_out_mps + QuantizedCPU: lt_out_quantized_cpu + tags: pointwise + +- func: lt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: lt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: lt_quantized_cpu + tags: [core, pointwise] + +- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: lt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: lt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# less, alias for torch.lt +- func: less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: take_out + +- func: take(Tensor self, Tensor index) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: take + +- func: take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor + variants: method, function + +- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, QuantizedCPU: index_select_out_cpu_ + CUDA, QuantizedCUDA: index_select_out_cuda + MPS: index_select_out_mps + +- func: index_select(Tensor self, int dim, Tensor index) -> Tensor + variants: method, function + dispatch: + CPU: index_select_cpu_ + QuantizedCPU: index_select_quantized_cpu_ + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda + SparseCPU: index_select_sparse_cpu + SparseCUDA: index_select_sparse_cuda + MPS: index_select_mps + tags: core + +- func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) + +- func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor + variants: method, function + +- func: index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: index_select_backward_symint + +- func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: masked_select_out_cpu + CUDA: masked_select_out_cuda + MPS: masked_select_out_mps + tags: dynamic_output_shape + +- func: masked_select(Tensor self, Tensor mask) -> Tensor + variants: method, function + dispatch: + CPU: masked_select_cpu + CUDA: masked_select_cuda + MPS: masked_select_mps + tags: dynamic_output_shape + +- func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: nonzero_out_cpu + CUDA: nonzero_out_cuda + MPS: nonzero_out_mps + tags: dynamic_output_shape + +- func: nonzero(Tensor self) -> Tensor + variants: method, function + dispatch: + CPU: nonzero_cpu + CUDA: nonzero_cuda + MPS: nonzero_mps + tags: [dynamic_output_shape, core] + +- func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: nonzero_static_out_cpu + +- func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + CPU: nonzero_static_cpu + +- func: nonzero_numpy(Tensor self) -> Tensor[] + variants: method, function + +- func: argwhere(Tensor self) -> Tensor + variants: method, function + tags: dynamic_output_shape + +- func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: gather_out + MPS: gather_out_mps + +- func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor + variants: method, function + structured_delegate: gather.out + tags: core + +- func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) 
+ +- func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor + variants: method, function + +- func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor + +- func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: addcmul_out + MPS: addcmul_out_mps + tags: pointwise + +- func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcmul.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcmul.out + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: addcdiv_out + MPS: addcdiv_out_mps + tags: pointwise + +- func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcdiv.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcdiv.out + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: cross_entropy_loss_symint + +- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient) + structured: True + dispatch: + CPU, CUDA: triangular_solve_out + MPS: triangular_solve_mps_out + SparseCsrCPU: triangular_solve_out_sparse_csr_cpu + SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda + +- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) + structured_delegate: triangular_solve.X + variants: method, function + +- func: _linalg_check_errors(Tensor info, str api_name, *, bool is_matrix) -> () + dispatch: + CompositeExplicitAutograd: _linalg_check_errors + +- func: linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + dispatch: + CPU, CUDA: linalg_solve_triangular_out + MPS: linalg_solve_triangular_mps_out + +- func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_solve_triangular + MPS: linalg_solve_triangular_mps + +- func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor + python_module: linalg + dispatch: + CompositeImplicitAutograd: linalg_vander_symint + +- func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) 
V) + +- func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) + variants: method, function + +# swapaxes, alias for transpose +- func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +# swapdims, alias for transpose +- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cholesky_out + +- func: cholesky(Tensor self, bool upper=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: cholesky + +- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: cholesky_solve_out + +- func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor + variants: method, function + dispatch: + CompositeExplicitAutograd: cholesky_solve + +- func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor + variants: function + dispatch: + CPU: _cholesky_solve_helper_cpu + CUDA: _cholesky_solve_helper_cuda + autogen: _cholesky_solve_helper.out + +- func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: cholesky_inverse + +- func: cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cholesky_inverse_out + +- func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + +- func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) + variants: method, function + +- func: geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau) + dispatch: + CPU, CUDA: geqrf_out + +- func: geqrf(Tensor self) -> (Tensor a, Tensor tau) + variants: method, function + dispatch: + CPU, CUDA: geqrf + +# orgqr, alias for linalg_householder_product +- func: orgqr(Tensor self, Tensor input2) -> Tensor + variants: method, function + +- func: orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!) + +- func: ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: ormqr_out + +- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: ormqr + +- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info) + variants: function + +- func: lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor + variants: method, function + +# lu_unpack +- func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) + structured_delegate: lu_unpack.out + variants: function + +- func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + variants: function + structured: True + dispatch: + CPU, CUDA: lu_unpack_out + +# TODO: remove dispatch section when porting TH CUDA to ATen +- func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: multinomial_out + MPS: multinomial_out_mps + +- func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: multinomial + MPS: multinomial_mps + tags: nondeterministic_seeded + +- func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lgamma_out + MPS: lgamma_out_mps + tags: pointwise + +- func: lgamma_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: lgamma.out + variants: method + tags: pointwise + +- func: lgamma(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: lgamma.out + variants: method, function + tags: pointwise + +- func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: digamma_out + MPS: digamma_out_mps + tags: pointwise + +- func: digamma(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: digamma.out + variants: method, function + tags: pointwise + +- func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: polygamma_out + MPS: polygamma_out_mps + tags: pointwise + +- func: polygamma(int n, Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: polygamma.out + variants: method, function + tags: pointwise + +- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: polygamma_ + tags: pointwise + +- func: erfinv(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erfinv.out + variants: method, function + dispatch: + SparseCPU, SparseCUDA: erfinv_sparse + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr + tags: pointwise + +- func: erfinv_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erfinv.out + variants: method + dispatch: + SparseCPU, SparseCUDA: erfinv_sparse_ + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_ + tags: pointwise + +- func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erfinv_out + MPS: erfinv_out_mps + SparseCPU, SparseCUDA: erfinv_sparse_out + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out + tags: pointwise + +- func: i0(Tensor self) -> Tensor + structured_delegate: i0.out + variants: function, method + tags: pointwise + +- func: i0_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: i0.out + variants: function, method + tags: pointwise + +- func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: i0_out + tags: pointwise + +- func: sign(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sign.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sign_sparse + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr + tags: [core, pointwise] + +- func: sign_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sign.out + variants: method + dispatch: + SparseCPU, SparseCUDA: sign_sparse_ + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_ + tags: pointwise + +- func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sign_out + MPS: sign_out_mps + SparseCPU, SparseCUDA: sign_sparse_out + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out + tags: pointwise + +- func: signbit(Tensor self) -> Tensor + variants: function, method + structured_delegate: signbit.out + dispatch: + SparseCPU, SparseCUDA: signbit_sparse + SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr + tags: pointwise + +- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: signbit_out + CUDA: signbit_out + MPS: signbit_out_mps + SparseCPU, SparseCUDA: signbit_sparse_out + SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out + tags: pointwise + +- func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: dist + autogen: dist.out + +- func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atan2_out + MPS: atan2_out_mps + tags: [core, pointwise] + +- func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: atan2.out + variants: method + tags: pointwise + +- func: atan2(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: atan2.out + variants: method, function + tags: [core, pointwise] +# arctan2, alias of atan2 + +- func: arctan2(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lerp_Scalar + MPS: lerp_Scalar_mps + tags: pointwise + +- func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lerp_Tensor + MPS: lerp_Tensor_mps + tags: pointwise + +- func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: lerp.Scalar_out + tags: pointwise + +- func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: lerp.Tensor_out + tags: pointwise + +- func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, MPS: histogram_histc_out + CUDA: _histc_out_cuda + +- func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor + variants: method, function + dispatch: + CPU, MPS: histogram_histc + CUDA: _histc_cuda + +- func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU, MPS: histogram_out + +- func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU, MPS: histogram + +- func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU, MPS: histogram_out + +- func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU, MPS: histogram + +- func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[] + dispatch: + CPU, MPS: histogramdd_bin_edges + autogen: _histogramdd_bin_edges.out + +- func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor + dispatch: + CPU, MPS: _histogramdd + autogen: _histogramdd_from_bin_cts.out + +- func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor + dispatch: + CPU, MPS: _histogramdd + autogen: _histogramdd_from_bin_tensors.out + +- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: fmod_out + tags: pointwise + +- func: fmod.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: fmod + tags: [core, pointwise] + +- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: fmod_ + tags: pointwise + +- func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: fmod_out + MPS: fmod_mps_out + tags: pointwise + +- func: fmod.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: fmod.Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: fmod.Tensor_out + tags: pointwise + +- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: hypot_out + MPS: hypot_out_mps + tags: pointwise + +- func: hypot(Tensor self, Tensor other) -> Tensor + structured_delegate: hypot.out + variants: method, function + tags: pointwise + +- func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: hypot.out + variants: method + tags: pointwise + +- func: igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: igamma_out + tags: pointwise + +- func: igamma(Tensor self, Tensor other) -> Tensor + structured_delegate: igamma.out + variants: method, function + tags: pointwise + +- func: igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: igamma.out + variants: method + tags: pointwise + +- func: igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: igammac_out + tags: pointwise + +- func: igammac(Tensor self, Tensor other) -> Tensor + structured_delegate: igammac.out + variants: method, function + tags: pointwise + +- func: igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: igammac.out + variants: method + tags: pointwise + +- func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA, MPS: nextafter_out + tags: pointwise + +- func: nextafter(Tensor self, Tensor other) -> Tensor + structured_delegate: nextafter.out + variants: method, function + tags: pointwise + +- func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: nextafter.out + variants: method + tags: pointwise + +- func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: remainder_out + tags: pointwise + +- func: remainder.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + dispatch: + CompositeExplicitAutograd: remainder + tags: [core, pointwise] + +- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ variants: method + dispatch: + CompositeExplicitAutograd: remainder_ + tags: pointwise + +- func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: remainder_out + MPS: remainder_out_mps + tags: pointwise + +- func: remainder.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out + variants: method + tags: pointwise + +- func: remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA, MPS: remainder + autogen: remainder.Scalar_Tensor_out + tags: pointwise + +- func: min(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: min + MPS: min_mps + QuantizedCPU: min_quantized_cpu + +- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: min_unary_out + QuantizedCPU: min_quantized_unary_out + +- func: fmin(Tensor self, Tensor other) -> Tensor + structured_delegate: fmin.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA, MPS: fmin_out + tags: pointwise + +- func: max(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: max + MPS: max_mps + QuantizedCPU: max_quantized_cpu + +- func: fmax(Tensor self, Tensor other) -> Tensor + structured_delegate: fmax.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA, MPS: fmax_out + tags: pointwise + +- func: maximum(Tensor self, Tensor other) -> Tensor + structured_delegate: maximum.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: [core, pointwise] + +- func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: maximum_out + MPS: maximum_out_mps + tags: pointwise + +# binary max, alias of maximum +# NOTE: max is not an alias for maximum, since there is also unary max +- func: max.other(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: pointwise + +- func: max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: max_unary_out + QuantizedCPU: max_quantized_unary_out + +- func: minimum(Tensor self, Tensor other) -> Tensor + structured_delegate: minimum.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: [core, pointwise] + +- func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: minimum_out + MPS: minimum_out_mps + tags: pointwise + +# binary min, alias for minimum +# NOTE: min is not an alias for minimum, since there is also unary min +- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: pointwise + +- func: min.other(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: sort_out + +- func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True + dispatch: + CPU, CUDA: sort_stable_out + MPS: sort_stable_out_mps + +- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: sort + tags: core + +- func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + structured_delegate: sort.values_stable + variants: method, function + dispatch: + QuantizedCPU: sort_quantized_cpu_stable + +- func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + +- func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) + variants: method, function + +- func: sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) + variants: method, function + +- func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: msort(Tensor self) -> Tensor + variants: method, function + +- func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA, MPS: argsort_stable + autogen: argsort.stable_out + +- func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor + variants: method, function + +- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True + dispatch: + CPU: topk_out_cpu + CUDA: topk_out_cuda + MPS: topk_out_mps + +- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) + variants: method, function + structured_delegate: topk.values + dispatch: + QuantizedCPU: topk_quantized_cpu + tags: core + +- func: all(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.all_out + variants: method, function + +- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + dispatch: + CPU, CUDA: all_all_out + MPS: all_all_out_mps + +- func: any(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.all_out + variants: method, function + dispatch: + SparseCPU, SparseCUDA: any_sparse + tags: core + +- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + dispatch: + CPU, CUDA: any_all_out + MPS: any_all_out_mps + +- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: renorm_out + MPS: renorm_out_mps + +- func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: renorm.out + +- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: renorm.out + +- func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: unfold + QuantizedCPU, QuantizedCUDA: unfold + +- func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor + variants: function + dispatch: + CPU, CUDA: unfold_backward + autogen: unfold_backward.out + +- func: equal(Tensor self, Tensor other) -> bool + tags: [data_dependent_output, pointwise] + variants: method, function + dispatch: + CPU: cpu_equal + CUDA: cuda_equal + MPS: mps_equal + QuantizedCPU: equal_quantized_cpu + +- func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: pow_Tensor_Tensor_out + MPS: pow_tensor_tensor_out_mps + tags: pointwise + +- func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: pow_Scalar_out + MPS: pow_Scalar_out_mps + tags: pointwise + +- func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Scalar_out + tags: [core, pointwise] + +- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: pow_Tensor_Scalar_out + SparseCPU, SparseCUDA: pow_out_sparse_scalar + MPS: pow_tensor_scalar_out_mps + tags: pointwise + +- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Scalar_out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: pow_sparse_scalar + tags: [core, pointwise] + +- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Scalar_out + variants: method + tags: pointwise + +- func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Tensor_out + variants: method + tags: pointwise + +- func: float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor + variants: function, method + tags: pointwise + +- func: float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor + tags: pointwise + +- func: float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + variants: function, method + tags: pointwise + +- func: float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + variants: method + tags: pointwise + +- func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) + variants: method + tags: pointwise + +- func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: normal_ + MPS: normal_mps_ + Meta: normal_meta_ + SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: normal_nested_ + autogen: normal.out + +# Only used by the functionalization pass. +# Normally, the codegen would be able to generate a normal() NativeFunction, +# but we can't due to overload ambiguity with normal.Tensor_float. +- func: normal_functional(Tensor self, float mean=0, float std=1, *, Generator? 
generator=None) -> Tensor + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: normal_functional + +- func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: normal_out + MPS: normal_mps_out + Meta: normal_out_meta + +- func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out + tags: nondeterministic_seeded + +- func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out + tags: nondeterministic_seeded + +- func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: normal + tags: nondeterministic_seeded + +- func: normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: normal_out + tags: nondeterministic_seeded + +- func: alias(Tensor(a) self) -> Tensor(a) + variants: method, function + dispatch: + CompositeExplicitAutograd: alias + NestedTensorCPU, NestedTensorCUDA: alias_nested + tags: core + +- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () + variants: function + dispatch: + CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ + CPU: _amp_foreach_non_finite_check_and_unscale_cpu_ + autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out + +- func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) + variants: function + dispatch: + CUDA: _amp_update_scale_cuda_ + CPU: _amp_update_scale_cpu_ + autogen: _amp_update_scale, _amp_update_scale.out + + #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + #dispatch: + #CPU: _cat_cpu + #CUDA: cat_cuda + #MPS: cat_mps + #QuantizedCPU: cat_quantized_cpu + + #- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) 
+ #dispatch: + #CPU: _cat_out_cpu + #CUDA: cat_out_cuda + #QuantizedCPU: cat_out_quantized_cpu + +- func: _foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow + CUDA: foreach_tensor_add_scalar_kernel_cuda + +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_ + CUDA: foreach_tensor_add_scalar_kernel_cuda_ + autogen: _foreach_add.Scalar_out + +- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow + CUDA: foreach_tensor_add_list_kernel_cuda + +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_ + CUDA: foreach_tensor_add_list_kernel_cuda_ + autogen: _foreach_add.List_out + +- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + autogen: _foreach_add.ScalarList_out + +- func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow + CUDA: foreach_tensor_add_tensor_kernel_cuda + +- func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_ + CUDA: foreach_tensor_add_tensor_kernel_cuda_ + autogen: _foreach_add.Tensor_out + +- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow + CUDA: foreach_tensor_sub_scalar_kernel_cuda + +- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_ + CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + 
autogen: _foreach_sub.Scalar_out + +- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow + CUDA: foreach_tensor_sub_list_kernel_cuda + +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_ + CUDA: foreach_tensor_sub_list_kernel_cuda_ + autogen: _foreach_sub.List_out + +- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + autogen: _foreach_sub.ScalarList_out + +- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow + CUDA: foreach_tensor_mul_scalar_kernel_cuda + +- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_ + CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + autogen: _foreach_mul.Scalar_out + +- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow + CUDA: foreach_tensor_mul_list_kernel_cuda + +- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_ + CUDA: foreach_tensor_mul_list_kernel_cuda_ + autogen: _foreach_mul.List_out + +- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + autogen: _foreach_mul.ScalarList_out + +- func: _foreach_mul.Tensor(Tensor[] self, 
Tensor other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow + CUDA: foreach_tensor_mul_tensor_kernel_cuda + +- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_ + CUDA: foreach_tensor_mul_tensor_kernel_cuda_ + autogen: _foreach_mul.Tensor_out + +- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow + CUDA: foreach_tensor_div_scalar_kernel_cuda + +- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_ + CUDA: foreach_tensor_div_scalar_kernel_cuda_ + autogen: _foreach_div.Scalar_out + +- func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow + CUDA: foreach_tensor_div_list_kernel_cuda + +- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_ + CUDA: foreach_tensor_div_list_kernel_cuda_ + autogen: _foreach_div.List_out + +- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + autogen: _foreach_div.ScalarList_out + +- func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow + CUDA: foreach_tensor_div_tensor_kernel_cuda + +- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_ + CUDA: foreach_tensor_div_tensor_kernel_cuda_ + autogen: _foreach_div.Tensor_out + +- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: 
function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + +- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_clamp_max.Scalar_out + +- func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow + CUDA: foreach_tensor_clamp_max_list_kernel_cuda + +- func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_ + CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + autogen: _foreach_clamp_max.List_out + +- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + +- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_max.ScalarList_out + +- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + +- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_clamp_min.Scalar_out + +- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow + CUDA: foreach_tensor_clamp_min_list_kernel_cuda + +- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_clamp_min.List_out + +- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back 
to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + +- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_min.ScalarList_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + +- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_maximum.Scalar_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow + CUDA: foreach_tensor_clamp_min_list_kernel_cuda + +- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_maximum.List_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + +- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + autogen: _foreach_maximum.ScalarList_out + +- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + +- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: 
foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_minimum.Scalar_out + +- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow + CUDA: foreach_tensor_clamp_max_list_kernel_cuda + +- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_ + CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + autogen: _foreach_minimum.List_out + +- func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + +- func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + autogen: _foreach_minimum.ScalarList_out + +- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow + CUDA: foreach_tensor_addcdiv_scalar_cuda + +- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow + CUDA: foreach_tensor_addcdiv_scalarlist_cuda + +- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow + CUDA: foreach_tensor_addcdiv_tensor_cuda + +- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_ + CUDA: foreach_tensor_addcdiv_scalar_cuda_ + autogen: _foreach_addcdiv.Scalar_out + +- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_ + CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ + autogen: _foreach_addcdiv.ScalarList_out + +- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, 
Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_ + CUDA: foreach_tensor_addcdiv_tensor_cuda_ + autogen: _foreach_addcdiv.Tensor_out + +- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow + CUDA: foreach_tensor_addcmul_scalar_cuda + +- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow + CUDA: foreach_tensor_addcmul_scalarlist_cuda + +- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow + CUDA: foreach_tensor_addcmul_tensor_cuda + +- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_ + CUDA: foreach_tensor_addcmul_scalar_cuda_ + autogen: _foreach_addcmul.Scalar_out + +- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_ + CUDA: foreach_tensor_addcmul_scalarlist_cuda_ + autogen: _foreach_addcmul.ScalarList_out + +- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_ + CUDA: foreach_tensor_addcmul_tensor_cuda_ + autogen: _foreach_addcmul.Tensor_out + +- func: _foreach_abs(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_abs_slow + CUDA: foreach_tensor_abs_cuda + +- func: _foreach_abs_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_abs_slow_ + CUDA: foreach_tensor_abs_cuda_ + autogen: _foreach_abs.out + +- func: _foreach_acos(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_acos_slow + CUDA: foreach_tensor_acos_cuda + +- func: _foreach_acos_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall 
back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_acos_slow_ + CUDA: foreach_tensor_acos_cuda_ + autogen: _foreach_acos.out + +- func: _foreach_asin(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_asin_slow + CUDA: foreach_tensor_asin_cuda + +- func: _foreach_asin_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_asin_slow_ + CUDA: foreach_tensor_asin_cuda_ + autogen: _foreach_asin.out + +- func: _foreach_atan(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_atan_slow + CUDA: foreach_tensor_atan_cuda + +- func: _foreach_atan_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_atan_slow_ + CUDA: foreach_tensor_atan_cuda_ + autogen: _foreach_atan.out + +- func: _foreach_ceil(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_ceil_slow + CUDA: foreach_tensor_ceil_cuda + +- func: _foreach_ceil_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_ceil_slow_ + CUDA: foreach_tensor_ceil_cuda_ + autogen: _foreach_ceil.out + +- func: _foreach_cos(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_cos_slow + CUDA: foreach_tensor_cos_cuda + +- func: _foreach_cos_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_cos_slow_ + CUDA: foreach_tensor_cos_cuda_ + autogen: _foreach_cos.out + +- func: _foreach_cosh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_cosh_slow + CUDA: foreach_tensor_cosh_cuda + +- func: _foreach_cosh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_cosh_slow_ + CUDA: foreach_tensor_cosh_cuda_ + autogen: _foreach_cosh.out + +- func: _foreach_erf(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_erf_slow + CUDA: foreach_tensor_erf_cuda + +- func: _foreach_erf_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + 
CompositeExplicitAutograd: foreach_tensor_erf_slow_ + CUDA: foreach_tensor_erf_cuda_ + autogen: _foreach_erf.out + +- func: _foreach_erfc(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_erfc_slow + CUDA: foreach_tensor_erfc_cuda + +- func: _foreach_erfc_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_erfc_slow_ + CUDA: foreach_tensor_erfc_cuda_ + autogen: _foreach_erfc.out + +- func: _foreach_exp(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_exp_slow + CUDA: foreach_tensor_exp_cuda + +- func: _foreach_exp_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_exp_slow_ + CUDA: foreach_tensor_exp_cuda_ + autogen: _foreach_exp.out + +- func: _foreach_expm1(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_expm1_slow + CUDA: foreach_tensor_expm1_cuda + +- func: _foreach_expm1_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_expm1_slow_ + CUDA: foreach_tensor_expm1_cuda_ + autogen: _foreach_expm1.out + +- func: _foreach_floor(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_floor_slow + CUDA: foreach_tensor_floor_cuda + +- func: _foreach_floor_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_floor_slow_ + CUDA: foreach_tensor_floor_cuda_ + autogen: _foreach_floor.out + +- func: _foreach_frac(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_frac_slow + CUDA: foreach_tensor_frac_cuda + +- func: _foreach_frac_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_frac_slow_ + CUDA: foreach_tensor_frac_cuda_ + autogen: _foreach_frac.out + +- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow + CUDA: foreach_tensor_lerp_ternary_cuda + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on 
different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_ + CUDA: foreach_tensor_lerp_ternary_cuda_ + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow + CUDA: foreach_tensor_lerp_list_cuda + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_ + CUDA: foreach_tensor_lerp_list_cuda_ + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lgamma(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_lgamma_slow + CUDA: foreach_tensor_lgamma_cuda + +- func: _foreach_lgamma_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_lgamma_slow_ + CUDA: foreach_tensor_lgamma_cuda_ + autogen: _foreach_lgamma.out + +- func: _foreach_log(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log_slow + CUDA: foreach_tensor_log_cuda + +- func: _foreach_log_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log_slow_ + CUDA: foreach_tensor_log_cuda_ + autogen: _foreach_log.out + +- func: _foreach_log10(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log10_slow + CUDA: foreach_tensor_log10_cuda + +- func: _foreach_log10_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log10_slow_ + CUDA: foreach_tensor_log10_cuda_ + autogen: _foreach_log10.out + +- func: _foreach_log1p(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log1p_slow + CUDA: foreach_tensor_log1p_cuda + +- func: _foreach_log1p_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log1p_slow_ + CUDA: foreach_tensor_log1p_cuda_ + autogen: _foreach_log1p.out + +- func: _foreach_log2(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log2_slow + CUDA: foreach_tensor_log2_cuda + +- func: 
_foreach_log2_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_log2_slow_ + CUDA: foreach_tensor_log2_cuda_ + autogen: _foreach_log2.out + +- func: _foreach_max(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_max_slow + CUDA: foreach_tensor_max_cuda + autogen: _foreach_max.out + +- func: _foreach_neg(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_neg_slow + CUDA: foreach_tensor_neg_cuda + +- func: _foreach_neg_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_neg_slow_ + CUDA: foreach_tensor_neg_cuda_ + autogen: _foreach_neg.out + +- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_norm_slow + CUDA: foreach_tensor_norm_cuda + autogen: _foreach_norm.Scalar_out + +- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow + CUDA: foreach_tensor_pow_list_kernel_cuda + +- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow + CUDA: foreach_tensor_pow_scalar_kernel_cuda + +- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow + CUDA: foreach_tensor_pow_scalarlist_kernel_cuda + +- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow + CUDA: foreach_scalar_pow_list_kernel_cuda + +- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_ + CUDA: foreach_tensor_pow_list_kernel_cuda_ + autogen: _foreach_pow.List_out + +- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_ + CUDA: foreach_tensor_pow_scalar_kernel_cuda_ + autogen: _foreach_pow.Scalar_out + +- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CompositeExplicitAutograd: 
foreach_tensor_pow_scalarlist_kernel_slow_ + CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_ + autogen: _foreach_pow.ScalarList_out + +- func: _foreach_reciprocal(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_reciprocal_slow + CUDA: foreach_tensor_reciprocal_cuda + +- func: _foreach_reciprocal_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_ + CUDA: foreach_tensor_reciprocal_cuda_ + autogen: _foreach_reciprocal.out + +- func: _foreach_round(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_round_slow + CUDA: foreach_tensor_round_cuda + +- func: _foreach_round_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_round_slow_ + CUDA: foreach_tensor_round_cuda_ + autogen: _foreach_round.out + +- func: _foreach_sigmoid(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sigmoid_slow + CUDA: foreach_tensor_sigmoid_cuda + +- func: _foreach_sigmoid_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_ + CUDA: foreach_tensor_sigmoid_cuda_ + autogen: _foreach_sigmoid.out + +- func: _foreach_sign(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sign_slow + CUDA: foreach_tensor_sign_cuda + +- func: _foreach_sign_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sign_slow_ + CUDA: foreach_tensor_sign_cuda_ + autogen: _foreach_sign.out + +- func: _foreach_sin(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sin_slow + CUDA: foreach_tensor_sin_cuda + +- func: _foreach_sin_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sin_slow_ + CUDA: foreach_tensor_sin_cuda_ + autogen: _foreach_sin.out + +- func: _foreach_sinh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sinh_slow + CUDA: foreach_tensor_sinh_cuda + +- func: _foreach_sinh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + 
CompositeExplicitAutograd: foreach_tensor_sinh_slow_ + CUDA: foreach_tensor_sinh_cuda_ + autogen: _foreach_sinh.out + +- func: _foreach_sqrt(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sqrt_slow + CUDA: foreach_tensor_sqrt_cuda + +- func: _foreach_sqrt_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_sqrt_slow_ + CUDA: foreach_tensor_sqrt_cuda_ + autogen: _foreach_sqrt.out + +- func: _foreach_tan(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_tan_slow + CUDA: foreach_tensor_tan_cuda + +- func: _foreach_tan_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_tan_slow_ + CUDA: foreach_tensor_tan_cuda_ + autogen: _foreach_tan.out + +- func: _foreach_tanh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_tanh_slow + CUDA: foreach_tensor_tanh_cuda + +- func: _foreach_tanh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_tanh_slow_ + CUDA: foreach_tensor_tanh_cuda_ + autogen: _foreach_tanh.out + +- func: _foreach_trunc(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_trunc_slow + CUDA: foreach_tensor_trunc_cuda + +- func: _foreach_trunc_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_trunc_slow_ + CUDA: foreach_tensor_trunc_cuda_ + autogen: _foreach_trunc.out + +- func: _foreach_zero_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_zero_slow_ + CUDA: foreach_tensor_zero_cuda_ + autogen: _foreach_zero, _foreach_zero.out + +- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_ + CUDA: foreach_tensor_copy_list_kernel_cuda_ + autogen: _foreach_copy.out + +- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out + device_check: NoCheck + variants: function + dispatch: + CompositeExplicitAutograd: _foreach_copy + +- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor + dispatch: + CPU: bucketize_cpu + CUDA: bucketize_cuda + MPS: bucketize_mps + +- func: bucketize.Tensor_out(Tensor self, 
Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: bucketize_out_cpu + CUDA: bucketize_out_cuda + MPS: bucketize_out_mps + +- func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor + dispatch: + CPU: bucketize_cpu + CUDA: bucketize_cuda + MPS: bucketize_mps + autogen: bucketize.Scalar_out + +- func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor + dispatch: + CPU: searchsorted_cpu + CUDA: searchsorted_cuda + MPS: searchsorted_mps + +- func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: searchsorted_out_cpu + CUDA: searchsorted_out_cuda + MPS: searchsorted_out_mps + +- func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor + dispatch: + CPU: searchsorted_cpu + CUDA: searchsorted_cuda + MPS: searchsorted_mps + +- func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: searchsorted_out_cpu + CUDA: searchsorted_out_cuda + MPS: searchsorted_out_mps + +- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor + structured_delegate: _convert_indices_from_coo_to_csr.out + +- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: _convert_indices_from_coo_to_csr_structured_cpu + CUDA: _convert_indices_from_coo_to_csr_structured_cuda + +- func: _convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor + structured_delegate: _convert_indices_from_csr_to_coo.out + +- func: _convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: _convert_indices_from_csr_to_coo_structured_cpu + CUDA: _convert_indices_from_csr_to_coo_structured_cuda + +## NN wrappers + +- func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: mse_loss_out + MPS: mse_loss_out_mps + +- func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out + python_module: nn + +- func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: mse_loss_backward_out + MPS: mse_loss_backward_out_mps + +- func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: mse_loss_backward + MPS: mse_loss_backward_mps + +- func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + +- func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) 
out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_out + CUDA: multi_margin_loss_cuda_out + +- func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu + CUDA: multi_margin_loss_cuda + +- func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_backward_out + CUDA: multi_margin_loss_cuda_backward_out + +- func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_backward + CUDA: multi_margin_loss_cuda_backward + +- func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + +- func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_forward_out_cpu + CUDA: multilabel_margin_loss_forward_out_cuda + +- func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_forward_cpu + CUDA: multilabel_margin_loss_forward_cuda + +- func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_backward_cpu_out + CUDA: multilabel_margin_loss_backward_cuda_out + +- func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor + python_module: nn + dispatch: + CPU: multilabel_margin_loss_backward_cpu + CUDA: multilabel_margin_loss_backward_cuda + +- func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss_nd_symint + +- func: nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss_symint + +- func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: nll_loss_forward_out_cpu + CUDA: nll_loss_forward_out_cuda + MPS: nll_loss_forward_out_mps + +- func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) + python_module: nn + structured_delegate: nll_loss_forward.output + +- func: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: nll_loss_backward_out_cpu + CUDA: nll_loss_backward_out_cuda + MPS: nll_loss_backward_out_mps + +- func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor + python_module: nn + structured_delegate: nll_loss_backward.grad_input + +- func: nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss2d_symint + +- func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: nll_loss2d_forward_out_cpu + CUDA: nll_loss2d_forward_out_cuda + MPS: nll_loss2d_forward_out_mps + +- func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) + python_module: nn + dispatch: + CPU: nll_loss2d_forward_cpu + CUDA: nll_loss2d_forward_cuda + MPS: nll_loss2d_forward_mps + +- func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: nll_loss2d_backward_out_cpu + CUDA: nll_loss2d_backward_out_cuda + MPS: nll_loss2d_backward_out_mps + +- func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor + python_module: nn + dispatch: + CPU: nll_loss2d_backward_cpu + CUDA: nll_loss2d_backward_cuda + MPS: nll_loss2d_backward_mps + +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss_out + MPS: smooth_l1_loss_out_mps + +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: smooth_l1_loss.out + python_module: nn + +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: smooth_l1_loss_backward_out + CUDA: smooth_l1_loss_backward_out + MPS: smooth_l1_loss_backward_out_mps + +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: smooth_l1_loss_backward + +- func: huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU, CUDA: huber_loss_out + MPS: huber_loss_out_mps + +- func: huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: huber_loss + MPS: huber_loss_mps + +- func: huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: huber_loss_backward_out + MPS: huber_loss_backward_out_mps + +- func: huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: huber_loss_backward + +- func: soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_out + +- func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss + +- func: soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_backward_out + +- func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_backward + +- func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: elu_out + MPS: elu_out_mps + +- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor + structured_delegate: elu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: elu_backward_out + MPS: elu_backward_out_mps + +- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor + structured_delegate: elu_backward.grad_input + python_module: nn + +- func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) + structured_delegate: elu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: glu_out + MPS: glu_out_mps + +- func: glu(Tensor self, int dim=-1) -> Tensor + structured_delegate: glu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU: glu_backward_cpu_out + CUDA: glu_backward_cuda_out + MPS: glu_backward_mps_out + +- func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor + python_module: nn + dispatch: + CPU: glu_backward_cpu + CUDA: glu_backward_cuda + MPS: glu_backward_mps + +- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_jvp + autogen: glu_jvp.out + +- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_backward_jvp + autogen: glu_backward_jvp.out + +- func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out + MPS: hardsigmoid_out_mps + QuantizedCPU: hardsigmoid_out_quantized_cpu + +- func: hardsigmoid(Tensor self) -> Tensor + structured_delegate: hardsigmoid.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: hardsigmoid_quantized_cpu + +- func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: hardsigmoid.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_backward_out + MPS: hardsigmoid_backward_out_mps + +- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: hardsigmoid_backward.grad_input + python_module: nn + +- func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh_out + QuantizedCPU: hardtanh_out_quantized_cpu + +- func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh + QuantizedCPU: hardtanh_quantized_cpu + tags: core + +- func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward_out + MPS: hardtanh_backward_out_mps + +- func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward + MPS: hardtanh_backward_mps + +- func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh_ + QuantizedCPU: hardtanh_quantized_cpu_ + +- func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish_out + MPS: hardswish_out_mps + +- func: hardswish(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish + MPS: hardswish_mps + +- func: hardswish_(Tensor(a!) self) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish_ + MPS: hardswish_mps_ + +- func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: hardswish_backward + MPS: hardswish_backward_mps + autogen: hardswish_backward.out + +- func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: leaky_relu_out + MPS: leaky_relu_out_mps + QuantizedCPU: leaky_relu_out_quantized_cpu + +- func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor + structured_delegate: leaky_relu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: leaky_relu_quantized_cpu + tags: core + +- func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward_out + MPS: leaky_relu_backward_out_mps + +- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor + structured_delegate: leaky_relu_backward.grad_input + python_module: nn + +- func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) + structured_delegate: leaky_relu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: leaky_relu_quantized_cpu_ + +- func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + +- func: log_sigmoid(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + +- func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: log_sigmoid_forward_out_cpu + CUDA: log_sigmoid_forward_out_cuda + MPS: log_sigmoid_forward_out_mps + +- func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: log_sigmoid_forward_cpu + CUDA: log_sigmoid_forward_cuda + MPS: log_sigmoid_forward_mps + +- func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: log_sigmoid_backward_cpu_out + CUDA: log_sigmoid_backward_cuda_out + MPS: log_sigmoid_backward_mps_out + +- func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor + python_module: nn + dispatch: + CPU: log_sigmoid_backward_cpu + CUDA: log_sigmoid_backward_cuda + MPS: log_sigmoid_backward_mps + +- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + tags: nondeterministic_seeded + dispatch: + CPU: rrelu_with_noise_out_cpu + CUDA: rrelu_with_noise_out_cuda + +- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? 
generator=None) -> Tensor + python_module: nn + dispatch: + CPU: rrelu_with_noise_cpu + CUDA: rrelu_with_noise_cuda + tags: nondeterministic_seeded + +- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: rrelu_with_noise_backward + autogen: rrelu_with_noise_backward.out + +- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) + python_module: nn + tags: nondeterministic_seeded + dispatch: + CPU: rrelu_with_noise_cpu_ + CUDA: rrelu_with_noise_cuda_ + +- func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: softplus_out + MPS: softplus_out_mps + +- func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor + structured_delegate: softplus.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: softplus_backward_out + MPS: softplus_backward_out_mps + +- func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor + structured_delegate: softplus_backward.grad_input + python_module: nn + +- func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: softshrink_out + MPS: softshrink_out_mps + +- func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor + structured_delegate: softshrink.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: softshrink_backward_out + MPS: softshrink_backward_out_mps + +- func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor + structured_delegate: softshrink_backward.grad_input + python_module: nn + +- func: adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: adaptive_avg_pool2d_out_cpu + CUDA: adaptive_avg_pool2d_out_cuda + MPS: adaptive_avg_pool2d_out_mps + MkldnnCPU: mkldnn_adaptive_avg_pool2d_out_stub + +- func: adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: adaptive_avg_pool2d_symint + +- func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor + dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d + +- func: mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d_out + +- func: mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor + dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d_backward + autogen: mkldnn_adaptive_avg_pool2d_backward.out + +- func: _adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor + dispatch: + CPU: adaptive_avg_pool2d_cpu + CUDA: adaptive_avg_pool2d_cuda + MPS: adaptive_avg_pool2d_mps + QuantizedCPU: adaptive_avg_pool2d_quantized_cpu + QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda + autogen: _adaptive_avg_pool2d.out + tags: core + +- func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU: adaptive_avg_pool2d_backward_cpu + CUDA: adaptive_avg_pool2d_backward_cuda + MPS: adaptive_avg_pool2d_backward_mps + autogen: _adaptive_avg_pool2d_backward.out + tags: core + +- func: adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_out_cpu + CUDA: adaptive_avg_pool3d_out_cuda + QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu + +- func: adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: adaptive_avg_pool3d_symint + +- func: _adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor + dispatch: + CPU: adaptive_avg_pool3d_cpu + CUDA: adaptive_avg_pool3d_cuda + QuantizedCPU: adaptive_avg_pool3d_quantized_cpu + autogen: _adaptive_avg_pool3d.out + tags: core + +- func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_out_cpu + CUDA: adaptive_avg_pool3d_backward_out_cuda + +- func: _adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_cpu + CUDA: adaptive_avg_pool3d_backward_cuda + autogen: _adaptive_avg_pool3d_backward.out + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool2d_out_cpu + CUDA: adaptive_max_pool2d_out_cuda + MPS: adaptive_max_pool2d_out_mps + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) + python_module: nn + structured_delegate: adaptive_max_pool2d.out + +- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool2d_backward_out_cpu + CUDA: adaptive_max_pool2d_backward_out_cuda + MPS: adaptive_max_pool2d_backward_out_mps + +- func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor + python_module: nn + structured_delegate: adaptive_max_pool2d_backward.grad_input + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool3d_out_cpu + CUDA: adaptive_max_pool3d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) + python_module: nn + structured_delegate: adaptive_max_pool3d.out + +- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool3d_backward_out_cpu + CUDA: adaptive_max_pool3d_backward_out_cuda + +- func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor + python_module: nn + structured_delegate: adaptive_max_pool3d_backward.grad_input + +- func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + precomputed: + - kernel_size -> int kH, int kW + - stride -> int dH, int dW + - padding -> int padH, int padW + dispatch: + CPU: avg_pool2d_out_cpu + CUDA: avg_pool2d_out_cuda + MPS: avg_pool2d_out_mps + MkldnnCPU: mkldnn_avg_pool2d_out + +- func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + python_module: nn + structured_delegate: avg_pool2d.out + dispatch: + MkldnnCPU: mkldnn_avg_pool2d + QuantizedCPU: avg_pool2d_quantized_cpu + tags: core + +- func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: avg_pool2d_backward_out_cpu + CUDA: avg_pool2d_backward_out_cuda + MPS: avg_pool2d_backward_out_mps + MkldnnCPU: mkldnn_avg_pool2d_backward_out + +- func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor + python_module: nn + structured_delegate: avg_pool2d_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_avg_pool2d_backward + tags: core + +- func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: avg_pool3d_out_cpu + CUDA: avg_pool3d_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_out + +- func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + python_module: nn + structured_delegate: avg_pool3d.out + dispatch: + MkldnnCPU: mkldnn_avg_pool3d + QuantizedCPU: avg_pool3d_quantized_cpu + tags: core + +- func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: avg_pool3d_backward_out_cpu + CUDA: avg_pool3d_backward_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_backward_out + +- func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor + python_module: nn + structured_delegate: avg_pool3d_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_avg_pool3d_backward + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: fractional_max_pool2d_out_cpu + CUDA: fractional_max_pool2d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) + python_module: nn + structured_delegate: fractional_max_pool2d.output + +- func: fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: fractional_max_pool2d_backward_cpu + CUDA: fractional_max_pool2d_backward_cuda + +- func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor + python_module: nn + structured_delegate: fractional_max_pool2d_backward.grad_input + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + precomputed: + - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW + - output_size -> int outputT, int outputH, int outputW + - int numBatch, int numPlanes, int inputT, int inputH, int inputW + dispatch: + CPU: fractional_max_pool3d_out_cpu + CUDA: fractional_max_pool3d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) + python_module: nn + structured_delegate: fractional_max_pool3d.output + +- func: fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: fractional_max_pool3d_backward_out_cpu + CUDA: fractional_max_pool3d_backward_out_cuda + +- func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor + python_module: nn + dispatch: + CPU: fractional_max_pool3d_backward_cpu + CUDA: fractional_max_pool3d_backward_cuda + +# Return: (Tensor output, Tensor indices) +- func: max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: max_pool2d_with_indices_out_cpu + CUDA: max_pool2d_with_indices_out_cuda + MPS: max_pool2d_with_indices_out_mps + +# Return: (Tensor output, Tensor indices) +- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + python_module: nn + structured_delegate: max_pool2d_with_indices.out + tags: core + +- func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: max_pool2d_with_indices_backward_out_cpu + CUDA: max_pool2d_with_indices_backward_out_cuda + MPS: max_pool2d_with_indices_backward_out_mps + +- func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor + python_module: nn + structured_delegate: max_pool2d_with_indices_backward.grad_input + tags: core + +# Return: (Tensor output, Tensor indices) +- func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_out_cpu + CUDA: max_pool3d_with_indices_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_cpu + CUDA: max_pool3d_with_indices_cuda + tags: core + +- func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_backward_out_cpu + CUDA: max_pool3d_with_indices_backward_out_cuda + +- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_backward_cpu + CUDA: max_pool3d_with_indices_backward_cuda + +- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: max_unpooling2d_forward_out_cpu + CUDA: max_unpooling2d_forward_out_cuda + +- func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor + python_module: nn + dispatch: + CPU: max_unpooling2d_forward_cpu + CUDA: max_unpooling2d_forward_cuda + +- func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: max_unpooling3d_forward_out_cpu + CUDA: max_unpooling3d_forward_out_cuda + +- func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor + python_module: nn + dispatch: + CPU: max_unpooling3d_forward_cpu + CUDA: max_unpooling3d_forward_cuda + +- func: reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) 
out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad1d_out_cpu + QuantizedCPU: reflection_pad1d_out_quantized_cpu + CUDA: reflection_pad1d_out_cuda + MPS: reflection_pad1d_out_mps + +- func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad1d.out + tags: core + +- func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad1d_backward_out_cpu + CUDA: reflection_pad1d_backward_out_cuda + MPS: reflection_pad1d_backward_out_mps + +- func: reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad1d_backward.grad_input + +- func: reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU, QuantizedCPU: reflection_pad2d_out_cpu + CUDA: reflection_pad2d_out_cuda + MPS: reflection_pad2d_out_mps + +- func: reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu + CUDA: reflection_pad2d_cuda + MPS: reflection_pad2d_mps + tags: core + +- func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: reflection_pad2d_backward_out_cpu + CUDA: reflection_pad2d_backward_out_cuda + MPS: reflection_pad2d_backward_out_mps + +- func: reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: reflection_pad2d_backward_cpu + CUDA: reflection_pad2d_backward_cuda + MPS: reflection_pad2d_backward_mps + +- func: reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_out_cpu + CUDA: reflection_pad3d_out_cuda + MPS: reflection_pad3d_out_mps + +- func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d.out + tags: core + +- func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_backward_out_cpu + CUDA: reflection_pad3d_backward_out_cuda + MPS: reflection_pad3d_backward_out_mps + +- func: reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d_backward.grad_input + +- func: replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad1d_out_cpu + CUDA: replication_pad1d_out_cuda + MPS: replication_pad1d_out_mps + +- func: replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad1d.out + +- func: replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: replication_pad1d_backward_out_cpu + CUDA: replication_pad1d_backward_out_cuda + MPS: replication_pad1d_backward_out_mps + +- func: replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad1d_backward.grad_input + +- func: replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad2d_out_cpu + CUDA: replication_pad2d_out_cuda + MPS: replication_pad2d_out_mps + +- func: replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad2d.out + tags: core + +- func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: replication_pad2d_backward_out_cpu + CUDA: replication_pad2d_backward_out_cuda + MPS: replication_pad2d_backward_out_mps + +- func: replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: replication_pad2d_backward_cpu + CUDA: replication_pad2d_backward_cuda + MPS: replication_pad2d_backward_mps + +- func: replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad3d_out_cpu + CUDA: replication_pad3d_out_cuda + MPS: replication_pad3d_out_mps + +- func: replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad3d.out + tags: core + + +- func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: replication_pad3d_backward_out_cpu + CUDA: replication_pad3d_backward_out_cuda + MPS: replication_pad3d_backward_out_mps + +- func: replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + dispatch: + CPU: replication_pad3d_backward_cpu + CUDA: replication_pad3d_backward_cuda + MPS: replication_pad3d_backward_mps + +- func: _pad_circular(Tensor self, SymInt[] pad) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: _pad_circular_symint + +- func: _pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: _pad_enum_symint + +- func: pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: pad_symint + +- func: upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_linear1d.vec_out + +- func: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_bilinear2d.vec_out + tags: core + +- func: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_bilinear2d_aa.vec_out + +- func: upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_trilinear3d.vec_out + +- func: upsample_bicubic2d.vec(Tensor input, SymInt[]? 
output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_bicubic2d.vec_out + +- func: _upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_bicubic2d_aa.vec_out + +- func: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest1d.vec_out + +- func: _upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact1d.vec_out + +- func: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest2d.vec_out + tags: core + +- func: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact2d.vec_out + +- func: upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest3d.vec_out + +- func: _upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact3d.vec_out + +# NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility. +- func: upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_linear1d_out_cpu + CUDA: upsample_linear1d_out_cuda + MPS: upsample_linear1d_out_mps + +- func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_linear1d.out + +- func: upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_linear1d_backward_out_cpu + CUDA: upsample_linear1d_backward_out_cuda + MPS: upsample_linear1d_backward_out_mps + +- func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_linear1d_backward.grad_input + +- func: upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bilinear2d_out_cpu + CUDA: upsample_bilinear2d_out_cuda + MPS: upsample_bilinear2d_out_mps + +- func: upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bilinear2d.out + dispatch: + QuantizedCPU: upsample_bilinear2d_quantized_cpu + +- func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
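+# Illustrative usage for the NOTE above about the ".vec" overloads (a sketch, not
+# part of the generated schema): Python callers normally reach these upsample ops
+# through torch.nn.functional.interpolate rather than invoking the aten overloads
+# directly, e.g.
+#     import torch
+#     import torch.nn.functional as F
+#     x = torch.randn(1, 3, 8, 8)
+#     y = F.interpolate(x, scale_factor=2.0, mode="bilinear", align_corners=False)
+# The surrounding non-"vec" overloads are the backward-compatibility entry points
+# mentioned in that NOTE.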
+ python_module: nn + structured: True + dispatch: + CPU: upsample_bilinear2d_backward_out_cpu + CUDA: upsample_bilinear2d_backward_out_cuda + MPS: upsample_bilinear2d_backward_out_mps + +- func: upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bilinear2d_backward.grad_input + +- func: _upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bilinear2d_aa_out_cpu + CUDA: _upsample_bilinear2d_aa_out_cuda + +- func: _upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bilinear2d_aa.out + +- func: _upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bilinear2d_aa_backward_out_cpu + CUDA: _upsample_bilinear2d_aa_backward_out_cuda + +- func: _upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bilinear2d_aa_backward.grad_input + +- func: upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bicubic2d_out_cpu + CUDA: upsample_bicubic2d_out_cuda + +- func: upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bicubic2d.out + +- func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bicubic2d_backward_out_cpu + CUDA: upsample_bicubic2d_backward_out_cuda + +- func: upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bicubic2d_backward.grad_input + +- func: _upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bicubic2d_aa_out_cpu + CUDA: _upsample_bicubic2d_aa_out_cuda + +- func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bicubic2d_aa.out + +- func: _upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: _upsample_bicubic2d_aa_backward_out_cpu + CUDA: _upsample_bicubic2d_aa_backward_out_cuda + +- func: _upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bicubic2d_aa_backward.grad_input + +- func: upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_trilinear3d_out_cpu + CUDA: upsample_trilinear3d_out_cuda + +- func: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_trilinear3d.out + +- func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_trilinear3d_backward_out_cpu + CUDA: upsample_trilinear3d_backward_out_cuda + +- func: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_trilinear3d_backward.grad_input + +- func: upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest1d_out_cpu + CUDA: upsample_nearest1d_out_cuda + MPS: upsample_nearest1d_out_mps + +- func: _upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact1d_out_cpu + CUDA: _upsample_nearest_exact1d_out_cuda + MPS: _upsample_nearest_exact1d_out_mps + +- func: upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest1d.out + +- func: _upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact1d.out + +- func: upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest1d_backward_out_cpu + CUDA: upsample_nearest1d_backward_out_cuda + MPS: upsample_nearest1d_backward_out_mps + +- func: _upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact1d_backward_out_cpu + CUDA: _upsample_nearest_exact1d_backward_out_cuda + MPS: _upsample_nearest_exact1d_backward_out_mps + +- func: upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? 
scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest1d_backward.grad_input + +- func: _upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact1d_backward.grad_input + +- func: upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest2d_out_cpu + CUDA: upsample_nearest2d_out_cuda + MPS: upsample_nearest2d_out_mps + +- func: _upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact2d_out_cpu + CUDA: _upsample_nearest_exact2d_out_cuda + MPS: _upsample_nearest_exact2d_out_mps + +- func: upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest2d.out + dispatch: + QuantizedCPU: upsample_nearest2d_quantized_cpu + +- func: _upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact2d.out + dispatch: + QuantizedCPU: _upsample_nearest_exact2d_quantized_cpu + +- func: upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest2d_backward_out_cpu + CUDA: upsample_nearest2d_backward_out_cuda + MPS: upsample_nearest2d_backward_out_mps + +- func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact2d_backward_out_cpu + CUDA: _upsample_nearest_exact2d_backward_out_cuda + MPS: _upsample_nearest_exact2d_backward_out_mps + +- func: upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest2d_backward.grad_input + +- func: _upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact2d_backward.grad_input + +- func: upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest3d_out_cpu + CUDA: upsample_nearest3d_out_cuda + +- func: _upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact3d_out_cpu + CUDA: _upsample_nearest_exact3d_out_cuda + +- func: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? 
scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest3d.out + dispatch: + QuantizedCPU: upsample_nearest3d_quantized_cpu + +- func: _upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact3d.out + dispatch: + QuantizedCPU: _upsample_nearest_exact3d_quantized_cpu + +- func: upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest3d_backward_out_cpu + CUDA: upsample_nearest3d_backward_out_cuda + +- func: _upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact3d_backward_out_cpu + CUDA: _upsample_nearest_exact3d_backward_out_cuda + +- func: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest3d_backward.grad_input + +- func: _upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact3d_backward.grad_input + +- func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sigmoid_backward_out + MPS: sigmoid_backward_out_mps + tags: pointwise + +- func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor + python_module: nn + structured_delegate: sigmoid_backward.grad_input + tags: pointwise + +- func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logit_backward_out + MPS: logit_backward_out_mps + tags: pointwise + +- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor + python_module: nn + structured_delegate: logit_backward.grad_input + tags: pointwise + +- func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tanh_backward_out + MPS: tanh_backward_out_mps + tags: pointwise + +- func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor + python_module: nn + structured_delegate: tanh_backward.grad_input + +# What's a thnn_conv_ versus a slow_conv_? +# +# Historically, we have inefficient implementations of convolutions +# coming from the THNN/THCUNN library. These convolutions typically +# operated by computing the Toeplitz matrix and then doing a matrix +# multiply with the input; this is very memory inefficient! 
However, +# occasionally, we really don't have anything better, so it's helpful +# to have these fallbacks when there is no more optimized implementation +# in cudnn or mkldnn, etc. Both thnn_ and slow_ convolutions fall +# into this bucket. +# +# The difference between these two designations, is that thnn_ refers +# to a convolution that is still written in the "legacy" style; that is, +# C code in the THNN/ or THCUNN/ directory. A slow_ convolution is +# one that is written in the native style: modern C++. Algorithmically, +# these are the same thing, but we give them different prefixes to +# make the operational distinction clear. + tags: pointwise + +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: slow_conv_transpose2d_structured_cpu + CUDA: slow_conv_transpose2d_structured_cuda + +- func: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor + python_module: nn + structured_delegate: slow_conv_transpose2d.out + +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv_transpose3d_out_cpu + CUDA: slow_conv_transpose3d_out_cuda + +- func: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_transpose3d_cpu + CUDA: slow_conv_transpose3d_cuda + +- func: thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor + python_module: nn + +- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv2d_forward_out_cpu + CUDA: slow_conv2d_forward_out_cuda + +- func: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv2d_forward_cpu + CUDA: slow_conv2d_forward_cuda + +- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) 
grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + python_module: nn + dispatch: + CPU: slow_conv2d_backward_out_cpu + CUDA: slow_conv2d_backward_out_cuda + +- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + python_module: nn + dispatch: + CPU: slow_conv2d_backward_cpu + CUDA: slow_conv2d_backward_cuda + autogen: _slow_conv2d_backward.output_mask_out + +- func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: nn + dispatch: + CUDA: conv_depthwise2d_cuda_out + +- func: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor + python_module: nn + dispatch: + CUDA: conv_depthwise2d_cuda + +- func: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor + python_module: nn + dispatch: + CUDA: conv_depthwise3d_cuda + autogen: conv_depthwise3d.out + +- func: slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor + python_module: nn + +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv3d_forward_out_cpu + +- func: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv3d_forward_cpu + +- func: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_dilated2d_cpu + CUDA: slow_conv_dilated2d_cuda + autogen: slow_conv_dilated2d.out + +- func: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_dilated3d_cpu + CUDA: slow_conv_dilated3d_cuda + autogen: slow_conv_dilated3d.out + +- func: col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: col2im_out_cpu + CUDA: col2im_out_cuda + +- func: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor + python_module: nn + dispatch: + CPU: col2im_cpu + CUDA: col2im_cuda + tags: core + +- func: column_stack(Tensor[] tensors) -> Tensor + +- func: column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) 
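+# Illustrative sketch (not part of the generated schema) of the im2col/GEMM scheme
+# described in the slow/thnn convolution comment above: unfold materializes the
+# im2col buffer and a single matmul then reproduces the convolution, which is why
+# the approach is memory-hungry, e.g.
+#     import torch
+#     import torch.nn.functional as F
+#     x = torch.randn(1, 3, 8, 8)
+#     w = torch.randn(4, 3, 3, 3)
+#     cols = F.unfold(x, kernel_size=3)                 # im2col buffer, shape (1, 27, 36)
+#     out = (w.view(4, -1) @ cols).view(1, 4, 6, 6)     # one GEMM, then reshape
+#     assert torch.allclose(out, F.conv2d(x, w), atol=1e-5)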
+ python_module: nn + dispatch: + CPU: im2col_out_cpu + CUDA: im2col_out_cuda + +- func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor + python_module: nn + dispatch: + CPU: im2col_cpu + CUDA: im2col_cuda + +- func: isfinite(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + +- func: isinf(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: isinf + SparseCPU, SparseCUDA: isinf_sparse + SparseMeta: isinf_sparse_meta + SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr + autogen: isinf.out + tags: [core, pointwise] + +- func: record_stream(Tensor(a!) self, Stream s) -> () + variants: method + dispatch: + CUDA: record_stream_cuda + +- func: isposinf(Tensor self) -> Tensor + variants: function, method + structured_delegate: isposinf.out + dispatch: + SparseCPU, SparseCUDA: isposinf_sparse + SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr + tags: pointwise + +- func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: isposinf_out + SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out + tags: pointwise + +- func: isneginf(Tensor self) -> Tensor + variants: function, method + structured_delegate: isneginf.out + dispatch: + SparseCPU, SparseCUDA: isneginf_sparse + SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr + tags: pointwise + +- func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: isneginf_out + SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out + tags: pointwise + +# NOTE [_add_batch_dim and _remove_batch_dim] +# _add_batch_dim and _remove_batch_dim are meant to be used in the implementation +# of the vmap frontend API (see torch/_vmap_internals.py). They are not +# user-facing, hence the leading underscore. Please don't use them them anywhere else. +- func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor + variants: function + +# See NOTE [_add_batch_dim and _remove_batch_dim] +- func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor + variants: function + +## Functions related to the `torch.special` namespace +# Note [special namespace binding] +# Functions in the special python module should have their names start with +# "special_" underscore and be bound to the desired Python name in +# torch/special/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/special.h. +# The "special_" names should be hidden from the user and not documented. + +- func: special_entr(Tensor self) -> Tensor + structured_delegate: special_entr.out + python_module: special + variants: function + tags: pointwise + +- func: special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_entr_out + tags: pointwise + +- func: special_ndtri(Tensor self) -> Tensor + structured_delegate: special_ndtri.out + python_module: special + variants: function + tags: pointwise + +- func: special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
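+# Illustrative example for Note [special namespace binding] above (a sketch, not
+# part of the generated schema): an entry such as special_entr keeps its
+# "special_"-prefixed name hidden and is surfaced to users as
+#     import torch
+#     t = torch.tensor([0.25, 0.5, 1.0])
+#     torch.special.entr(t)   # Python binding from torch/special/__init__.py
+# and as torch::special::entr(t) in C++ via torch/csrc/api/include/torch/special.h.
+# The fft_ and linalg_ notes further below follow the same convention.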
+ structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_ndtri_out + tags: pointwise + +- func: special_log_ndtr(Tensor self) -> Tensor + structured_delegate: special_log_ndtr.out + python_module: special + variants: function + tags: pointwise + +- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_log_ndtr_out + tags: pointwise + +- func: special_expm1(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_exp2(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_psi(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_digamma(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammaln(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_erf(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_erfc(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_erfcx(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_erfcx.out + tags: pointwise + +- func: special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_erfcx_out + tags: pointwise + +- func: special_erfinv(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_ndtr(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: special + variants: function + +- func: special_xlog1py(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + structured_delegate: special_xlog1py.out + tags: pointwise + +- func: special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py + tags: pointwise + +- func: special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py + tags: pointwise + +- func: special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_xlog1py_out + tags: pointwise + +- func: special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py_out + tags: pointwise + +- func: special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py_out + tags: pointwise + +- func: special_xlogy(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_zeta(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + structured_delegate: special_zeta.out + tags: pointwise + +- func: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + tags: pointwise + +- func: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + tags: pointwise + +- func: special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_zeta_out + tags: pointwise + +- func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + tags: pointwise + +- func: special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + tags: pointwise + +- func: special_i0(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_i0e(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i0e.out + tags: pointwise + +- func: special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i0e_out + tags: pointwise + +- func: special_i1(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1.out + tags: pointwise + +- func: special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1_out + tags: pointwise + +- func: special_i1e(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1e.out + tags: pointwise + +- func: special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1e_out + tags: pointwise + +- func: special_logit(Tensor self, float? eps=None) -> Tensor + python_module: special + variants: function + +- func: special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_polygamma(int n, Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + python_module: special + variants: function + +- func: special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_expit(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_sinc(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_round(Tensor self, *, int decimals=0) -> Tensor + python_module: special + variants: function + +- func: special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!) 
+ python_module: special + variants: function + +- func: special_log1p(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + +- func: special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammainc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammaincc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln(Tensor self, int p) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + +## Functions related to the fast Fourier transform and the torch.fft namespace +# Note [FFT namespace binding] +# Functions in the fft python module should have their names start with +# "fft_" underscore and be bound to the desired Python name in +# torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h. +# The "fft_" names should be hidden from the user and not documented. +# +# See fft_fft as an example. + +# torch.fft.fft +# NOTE: NOT an alias for torch.fft, which has different semantics +- func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint + +- func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint_out + +- func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint + +- func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint_out + +- func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint + +- func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint_out + +- func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint + +- func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint_out + +- func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? 
norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint + +- func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint_out + +- func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint + +- func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint_out + +- func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint + +- func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint_out + +- func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint + +- func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint_out + +- func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint + +- func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint_out + +- func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint + +- func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint_out + +- func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint + +- func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint_out + +- func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint + +- func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint_out + +- func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? 
dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint + +- func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint_out + +- func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint + +- func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint_out + +- func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint + +- func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint_out + +- func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint + +- func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint_out + +- func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint + +- func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint_out + +- func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint + +- func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint_out + +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_fftfreq + +- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_fftfreq_out + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_rfftfreq + +- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_rfftfreq_out + +- func: fft_fftshift(Tensor self, int[1]? 
dim=None) -> Tensor + python_module: fft + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + variants: function + +## Functions for linear algebra and the torch.linalg namespace +# Note [linalg namespace binding] +# Functions in the linalg python module should have their names start with +# "linalg_" and be bound to the desired Python name in +# torch/linalg/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/linalg.h. +# The "linalg_" names should be hidden from the user and not documented. +# +# See linalg_det as an example. + +# "_ex" stands for experimental +- func: linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info) + python_module: linalg + structured_delegate: linalg_cholesky_ex.L + +- func: linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_cholesky_ex_out + +- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor + python_module: linalg + +- func: linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor + python_module: linalg + variants: function + structured_delegate: linalg_cross.out + dispatch: + ZeroTensor: linalg_cross_zerotensor + +- func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + structured: True + dispatch: + CPU, CUDA, MPS: linalg_cross_out + +# linalg.lu_factor +- func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) + python_module: linalg + structured_delegate: linalg_lu_factor_ex.out + variants: function + +- func: linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_factor_ex_out + +# linalg.lu +- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + python_module: linalg + structured_delegate: linalg_lu.out + variants: function + +- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_out + +# linalg.lu_solve +- func: linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor + python_module: linalg + structured_delegate: linalg_lu_solve.out + variants: function + +- func: linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!) 
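+# Illustrative example for the "_ex" convention above (a sketch, not part of the
+# generated schema): the experimental _ex variants report failures through an
+# extra `info` tensor instead of raising, while the plain variants are the
+# error-checked, user-facing wrappers, e.g.
+#     import torch
+#     A = torch.eye(3)
+#     L, info = torch.linalg.cholesky_ex(A)   # info == 0 signals success
+#     L2 = torch.linalg.cholesky(A)           # raises if A is not positive-definite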
+ python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_solve_out + +# linalg.det +- func: _linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots) + structured_delegate: _linalg_det.result + +- func: _linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) + structured: True + dispatch: + CPU, CUDA: _linalg_det_out + +- func: linalg_det(Tensor A) -> Tensor + python_module: linalg + variants: function + +- func: linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +# torch.det, alias for torch.linalg.det +- func: det(Tensor self) -> Tensor + variants: function, method + +- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info) + structured_delegate: linalg_ldl_factor_ex.out + python_module: linalg + variants: function + +- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_factor_ex_out + +- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor + structured_delegate: linalg_ldl_solve.out + python_module: linalg + variants: function + +- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_solve_out + +- func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) + python_module: linalg + variants: function + dispatch: + CompositeExplicitAutograd: linalg_lstsq + tags: dynamic_output_shape + +- func: linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_lstsq_out + tags: dynamic_output_shape + +# torch.linalg.matmul, alias for torch.matmul +- func: linalg_matmul(Tensor self, Tensor other) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_vecdot(Tensor x, Tensor y, *, int dim=-1) -> Tensor + python_module: linalg + variants: function + +- func: linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) 
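+# Illustrative check for the alias comments above (a sketch, not part of the
+# generated schema): an alias resolves to the same underlying computation, so e.g.
+#     import torch
+#     A = torch.randn(3, 3)
+#     assert torch.equal(torch.det(A), torch.linalg.det(A))           # torch.det aliases linalg.det
+#     assert torch.equal(torch.linalg.matmul(A, A), torch.matmul(A, A))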
+ python_module: linalg + +- func: linalg_matrix_exp(Tensor self) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_matrix_exp + autogen: linalg_matrix_exp.out + +- func: _linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots) + structured_delegate: _linalg_slogdet.sign + +- func: _linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) + structured: True + dispatch: + CPU, CUDA: _linalg_slogdet_out + +- func: linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet) + python_module: linalg + +- func: linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet) + python_module: linalg + +- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) + variants: function, method + +- func: slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet) + variants: function + +- func: logdet(Tensor self) -> Tensor + variants: function, method + +- func: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors) + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_eig + +- func: linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + python_module: linalg + dispatch: + CPU, CUDA: linalg_eig_out + +- func: _linalg_eigvals(Tensor self) -> Tensor + python_module: linalg + dispatch: + CPU, CUDA: _linalg_eigvals + +- func: linalg_eigvals(Tensor self) -> Tensor + python_module: linalg + +- func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + dispatch: + CPU, CUDA: linalg_eigvals_out + +# This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and +# `linalg.eigvalsh` as composite functions that call this one +- func: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors) + structured_delegate: _linalg_eigh.eigenvalues + +- func: _linalg_eigh.eigenvalues(Tensor A, str UPLO="L", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + structured: True + dispatch: + CPU, CUDA: _linalg_eigh_out + +- func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) + python_module: linalg + +- func: linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + python_module: linalg + +- func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor + python_module: linalg + +- func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_householder_product(Tensor input, Tensor tau) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_householder_product + +- func: linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + dispatch: + CPU, CUDA: linalg_householder_product_out + +- func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) + python_module: linalg + structured_delegate: linalg_inv_ex.inverse + +- func: linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) 
info) -> (Tensor(a!) inverse, Tensor(b!) info) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_inv_ex_out + MPS: linalg_inv_ex_out_mps + +- func: linalg_inv(Tensor A) -> Tensor + python_module: linalg + +- func: linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: inverse(Tensor self) -> Tensor + variants: function, method + +- func: inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: inner(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: outer(Tensor self, Tensor vec2) -> Tensor + variants: function, method + +- func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +# torch.ger, alias for torch.outer +- func: ger(Tensor self, Tensor vec2) -> Tensor + variants: function, method + +- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + structured_delegate: linalg_vector_norm.out + +- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_vector_norm_out + MPS: linalg_vector_norm_out_mps + +- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + +- func: linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_matrix_norm.str_ord(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + +- func: linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +# This function is exposes the `compute_uv` flag, which is then used to implement `linalg.svd` and +# `linalg.svdvals` as composite functions that call this one +- func: _linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh) + variants: function + structured_delegate: _linalg_svd.U + +- func: _linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) 
Vh) + structured: True + dispatch: + CPU, CUDA: _linalg_svd_out + +- func: linalg_svd(Tensor A, bool full_matrices=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh) + python_module: linalg + variants: function + +- func: linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) + python_module: linalg + variants: function + +- func: linalg_svdvals(Tensor A, *, str? driver=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_cond.p_str(Tensor self, str p) -> Tensor + python_module: linalg + variants: function + +- func: linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + dispatch: + # calls svd, which calls mH() (view op) + # also calls narrow() + CompositeExplicitAutogradNonFunctional: linalg_pinv + +- func: linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + CompositeExplicitAutograd: linalg_pinv_out + +- func: linalg_pinv.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_pinv(Tensor self, float rcond, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_pinv.rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: _linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info) + structured_delegate: _linalg_solve_ex.result + +- func: _linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) + structured: True + dispatch: + CPU, CUDA: _linalg_solve_ex_out + +- func: linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info) + python_module: linalg + +- func: linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) 
result, Tensor(b!) info) + python_module: linalg + +- func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor + python_module: linalg + +- func: linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor + python_module: linalg + variants: function + +- func: linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) + python_module: linalg + variants: function + structured_delegate: linalg_qr.out + +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_qr_out + +- func: linalg_matrix_power(Tensor self, int n) -> Tensor + python_module: linalg + +- func: linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_matrix_rank.atol_rtol_tensor(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_matrix_rank(Tensor self, float tol, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_matrix_rank.tol_tensor(Tensor input, Tensor tol, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_multi_dot(Tensor[] tensors) -> Tensor + python_module: linalg + +- func: linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +## Functions related to the `torch.nested` namespace +# Note [nested namespace binding] +# Functions in the nested python module should have their names start with +# "nested_" underscore and be bound to the desired Python name in +# torch/nested/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/nested.h. +# The "nested_" names should be hidden from the user and not documented. + +- func: nested_to_padded_tensor(Tensor self, float padding, int[]? 
output_size=None) -> Tensor + python_module: nested + variants: function + +## Functions that are only for testing +# It is undocumented and should not be used outside of tests. +- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor + +# Note: for testing COW materialization within `at::parallel_for` loop function +- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _test_parallel_materialize + +# Note: this function is only for testing. +- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_intlist + autogen: _test_optional_intlist.out + +# Note: this function is only for testing. +- func: _test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_intlist + autogen: _test_optional_filled_intlist.out + +# Note: this function is only for testing. +- func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_floatlist + autogen: _test_optional_floatlist.out + +# Note: this function is only for testing. +- func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor + cpp_no_default_args: ['a', 'b'] + python_module: nn + +# Note: this function is only for testing. +- func: _test_warn_in_autograd(Tensor self) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: _test_warn_in_autograd + autogen: _test_warn_in_autograd.out + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor + dispatch: + # the NestedTensor keys are necessary because NestedTensor has been removed + # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys] + CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage + autogen: _test_autograd_multiple_dispatch.fullcoverage_out + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor + dispatch: + CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a) + dispatch: + CompositeExplicitAutograd: _test_autograd_multiple_dispatch_view + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _test_autograd_multiple_dispatch_view_copy + tags: view_copy + autogen: _test_autograd_multiple_dispatch_view_copy.out + +- func: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor + variants: function + dispatch: + CPU, CUDA: segment_reduce_kernel + autogen: segment_reduce.out + +- func: _segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? 
lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None) -> Tensor + variants: function + dispatch: + CPU, CUDA: _segment_reduce_backward_kernel + autogen: _segment_reduce_backward.out + +- func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor + python_module: nn + variants: function + +- func: flatten_dense_tensors(Tensor[] tensors) -> Tensor + variants: function + python_module: nn + +- func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] + variants: function + python_module: nn + +- func: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _nested_tensor_from_tensor_list + autogen: _nested_tensor_from_tensor_list.out + +- func: _fw_primal_copy(Tensor self, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _fw_primal_copy + tags: view_copy + autogen: _fw_primal_copy.out + +- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _make_dual_copy + tags: view_copy + autogen: _make_dual_copy.out + +- func: view_as_real_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_as_real_copy + tags: view_copy + autogen: view_as_real_copy.out + +- func: view_as_complex_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_as_complex_copy + tags: view_copy + autogen: view_as_complex_copy.out + +- func: _conj_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _conj_copy + tags: view_copy + autogen: _conj_copy.out + +- func: _neg_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _neg_view_copy + tags: view_copy + autogen: _neg_view_copy.out + +- func: as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? 
storage_offset=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided_copy_symint + tags: view_copy + autogen: as_strided_copy.out + +- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _sparse_broadcast_to_copy + tags: view_copy + autogen: _sparse_broadcast_to_copy.out + +- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: diagonal_copy + tags: view_copy + autogen: diagonal_copy.out + +- func: expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: expand_copy_symint + tags: view_copy + autogen: expand_copy.out + +- func: permute_copy(Tensor self, int[] dims) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: permute_copy + tags: view_copy + autogen: permute_copy.out + +- func: _reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint + tags: view_copy + autogen: _reshape_alias_copy.out + +- func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: select_copy_symint + SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr + tags: view_copy + autogen: select_copy.int_out + +- func: detach_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: detach_copy + tags: view_copy + autogen: detach_copy.out + +- func: slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: slice_copy_Tensor_symint + tags: view_copy + autogen: slice_copy.Tensor_out + +- func: split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: split_copy_Tensor_symint + tags: view_copy + +- func: split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: split_with_sizes_copy_symint + tags: view_copy + +- func: squeeze_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy + tags: view_copy + autogen: squeeze_copy.out + +- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy_dim + tags: view_copy + autogen: squeeze_copy.dim_out + +- func: squeeze_copy.dims(Tensor self, int[] dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy_dims + tags: view_copy + autogen: squeeze_copy.dims_out + +- func: t_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: t_copy + tags: view_copy + autogen: t_copy.out + +- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: transpose_copy_int + tags: view_copy + autogen: transpose_copy.int_out + +- func: unsqueeze_copy(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unsqueeze_copy + tags: view_copy + autogen: unsqueeze_copy.out + +- func: _indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _indices_copy + tags: view_copy + autogen: _indices_copy.out + +- func: _values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _values_copy + tags: view_copy + autogen: _values_copy.out + +- func: indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: indices_copy + tags: view_copy + autogen: indices_copy.out + +- func: values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: values_copy + tags: view_copy + autogen: values_copy.out + +- func: crow_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: crow_indices_copy + tags: view_copy + autogen: crow_indices_copy.out + +- func: col_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: col_indices_copy + tags: view_copy + autogen: col_indices_copy.out + +- func: ccol_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: ccol_indices_copy + tags: view_copy + autogen: ccol_indices_copy.out + +- func: row_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: row_indices_copy + tags: view_copy + autogen: row_indices_copy.out + +- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unbind_copy_int + tags: view_copy + +- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + 
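+# The *_copy entries in this block mirror view ops but materialize the result
+# instead of aliasing the input (tags: view_copy). A rough illustration, assuming
+# they are exposed in the torch namespace as e.g. torch.transpose_copy:
+#   >>> import torch
+#   >>> x = torch.arange(6).reshape(2, 3)
+#   >>> v = torch.transpose(x, 0, 1)         # view: shares storage with x
+#   >>> c = torch.transpose_copy(x, 0, 1)    # copy: independent storage
+#   >>> v.data_ptr() == x.data_ptr(), c.data_ptr() == x.data_ptr()
+#   (True, False)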
dispatch: + CompositeExplicitAutograd: unbind_copy_int_out + +- func: split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor_out + + +- func: split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy_out + CUDA: split_with_sizes_copy_out_cuda + +- func: view_copy(Tensor self, SymInt[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_copy_symint + tags: view_copy + autogen: view_copy.out + +- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_copy_dtype + tags: view_copy + autogen: view_copy.dtype_out + +- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unfold_copy + tags: view_copy + autogen: unfold_copy.out + +- func: alias_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: alias_copy + tags: view_copy + autogen: alias_copy.out + +- func: to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor + variants: method + dispatch: + NestedTensorCPU: NestedTensor_to_padded_tensor_generic + NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda + autogen: to_padded_tensor.out + +- func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor + variants: function + dispatch: + CUDA: _fbgemm_jagged_to_padded_dense_forward + +- func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor + variants: function + dispatch: + CUDA: _fbgemm_dense_to_jagged_forward_symint + +- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor + dispatch: + NestedTensorCPU: NestedTensor_softmax_dropout + NestedTensorCUDA: NestedTensor_softmax_dropout_cuda + tags: nondeterministic_seeded + +# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. +- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward + autogen: _transformer_encoder_layer_fwd.out + +- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor) + variants: function + dispatch: + CPU, NestedTensorCPU: native_multi_head_attention_cpu + CUDA, NestedTensorCUDA: native_multi_head_attention_cuda + autogen: _native_multi_head_attention.out + +- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? 
scale=None) -> Tensor + python_module: nn + variants: function + autogen: scaled_dot_product_attention.out + tags: nondeterministic_seeded + +# This aten function is kept so that we can test the choice function from Python +- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int + dispatch: + Meta: _fused_sdp_choice_meta + CPU, NestedTensorCPU: _fused_sdp_choice_cpp + CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor) + variants: function + tags: nondeterministic_seeded + +- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + dispatch: + CUDA: _scaled_dot_product_flash_attention_cuda + NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp) + dispatch: + CPU: _scaled_dot_product_flash_attention_cpu + tags: nondeterministic_seeded + +- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value) + device_check: NoCheck + variants: function + dispatch: + CUDA: _scaled_dot_product_flash_attention_backward_cuda + NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested + +- func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value) + device_check: NoCheck + variants: function + dispatch: + CPU: _scaled_dot_product_flash_attention_cpu_backward + +- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset) + dispatch: + CUDA: _scaled_dot_product_efficient_attention_cuda + NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? 
scale=None) -> (Tensor, Tensor, Tensor, Tensor) + device_check: NoCheck + dispatch: + CUDA: _scaled_dot_product_efficient_attention_backward_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + tags: nondeterministic_seeded + +- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + variants: function + dispatch: + CUDA: _flash_attention_forward + tags: nondeterministic_seeded + +- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) + device_check: NoCheck + variants: function + dispatch: + CUDA: _flash_attention_backward + +# Returns output, logsumexp if compute_logsumexp +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) + variants: function + dispatch: + CUDA: _efficient_attention_forward + tags: nondeterministic_seeded + +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor) + device_check: NoCheck + variants: function + dispatch: + CUDA: _efficient_attention_backward + +- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor + variants: function + dispatch: + CUDA: triton_scaled_dot_attention + tags: nondeterministic_seeded + autogen: _triton_scaled_dot_attention.out + +- func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!) 
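+# The attention entries above are normally reached through the public
+# scaled_dot_product_attention op, which selects one of the fused backends
+# (flash / memory-efficient / cuDNN) or the math fallback. A small sketch,
+# assuming the usual torch.nn.functional binding:
+#   >>> import torch
+#   >>> import torch.nn.functional as F
+#   >>> q = torch.randn(2, 8, 16, 64)   # (batch, heads, seq_len, head_dim)
+#   >>> k = torch.randn(2, 8, 16, 64)
+#   >>> v = torch.randn(2, 8, 16, 64)
+#   >>> out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+#   >>> out.shape
+#   torch.Size([2, 8, 16, 64])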
+ variants: function + dispatch: + CUDA: _fill_mem_eff_dropout_mask_ + tags: nondeterministic_seeded + +- func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor + variants: function + dispatch: + CUDA: triton_multi_head_attention + autogen: _triton_multi_head_attention.out + +- func: special_airy_ai(Tensor x) -> Tensor + python_module: special + structured_delegate: special_airy_ai.out + variants: function + tags: pointwise + +- func: special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_airy_ai_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_j0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_j0.out + variants: function + tags: pointwise + +- func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_j0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_j1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_j1.out + variants: function + tags: pointwise + +- func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_j1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_y0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_y0.out + variants: function + tags: pointwise + +- func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_y0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_y1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_y1.out + variants: function + tags: pointwise + +- func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_y1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_t.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_t + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_t + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) 
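+# These kernels back the torch.special bindings; a short sketch, assuming the
+# usual Python exposure (e.g. torch.special.bessel_j0,
+# torch.special.chebyshev_polynomial_t):
+#   >>> import torch
+#   >>> x = torch.linspace(-1, 1, 5)
+#   >>> torch.special.bessel_j0(x)                         # J0 evaluated elementwise
+#   >>> t2 = torch.special.chebyshev_polynomial_t(x, 2)    # T_2(x)
+#   >>> torch.allclose(t2, 2 * x**2 - 1)
+#   True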
+ device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_t_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_u.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_u + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_u + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_u_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_v.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_v + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_v + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_v_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_w.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_w + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_w + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_w_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_hermite_polynomial_h.out + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_h + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_h + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_hermite_polynomial_h_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_h_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
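+# hermite_polynomial_h is the "physicists'" family H_n, while the
+# hermite_polynomial_he entries below are the "probabilists'" family He_n; they
+# are related by He_n(x) = 2**(-n/2) * H_n(x / sqrt(2)). A quick check, assuming
+# the torch.special bindings:
+#   >>> import torch
+#   >>> x = torch.linspace(-2, 2, 7)
+#   >>> lhs = torch.special.hermite_polynomial_he(x, 3)
+#   >>> rhs = 2 ** (-1.5) * torch.special.hermite_polynomial_h(x / 2 ** 0.5, 3)
+#   >>> torch.allclose(lhs, rhs)
+#   True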
+ dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_h_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_hermite_polynomial_he.out + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_he + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_he + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_hermite_polynomial_he_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_he_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_he_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_laguerre_polynomial_l.out + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_laguerre_polynomial_l + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_laguerre_polynomial_l + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_laguerre_polynomial_l_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_laguerre_polynomial_l_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: special_laguerre_polynomial_l_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_legendre_polynomial_p.out + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_legendre_polynomial_p + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_legendre_polynomial_p + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_legendre_polynomial_p_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_legendre_polynomial_p_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_legendre_polynomial_p_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_modified_bessel_i0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_i0.out + variants: function + tags: pointwise + +- func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_i0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_i1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_i1.out + variants: function + tags: pointwise + +- func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_i1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_k0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_k0.out + variants: function + tags: pointwise + +- func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_k0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_k1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_k1.out + variants: function + tags: pointwise + +- func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
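+# The scaled variants that follow are the exponentially scaled Bessel functions,
+# i.e. scaled_modified_bessel_k0(x) == exp(x) * modified_bessel_k0(x) (and
+# likewise for k1), which stays finite for large x. A quick check, assuming the
+# torch.special bindings:
+#   >>> import torch
+#   >>> x = torch.linspace(0.5, 5.0, 6)
+#   >>> lhs = torch.special.scaled_modified_bessel_k0(x)
+#   >>> rhs = torch.exp(x) * torch.special.modified_bessel_k0(x)
+#   >>> torch.allclose(lhs, rhs)
+#   True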
+ dispatch: + CPU, CUDA: special_modified_bessel_k1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k0(Tensor x) -> Tensor + python_module: special + structured_delegate: special_scaled_modified_bessel_k0.out + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_scaled_modified_bessel_k0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k1(Tensor x) -> Tensor + python_module: special + structured_delegate: special_scaled_modified_bessel_k1.out + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_scaled_modified_bessel_k1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_t.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_t_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
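+# The shifted Chebyshev polynomials are conventionally the classical ones mapped
+# from [-1, 1] onto [0, 1], i.e. T*_n(x) = T_n(2x - 1). A quick check under that
+# assumption, using the torch.special bindings:
+#   >>> import torch
+#   >>> x = torch.linspace(0, 1, 9)
+#   >>> lhs = torch.special.shifted_chebyshev_polynomial_t(x, 4)
+#   >>> rhs = torch.special.chebyshev_polynomial_t(2 * x - 1, 4)
+#   >>> torch.allclose(lhs, rhs)
+#   True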
+ dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_u.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_u_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_v.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_v_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_w.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_w_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_spherical_bessel_j0(Tensor x) -> Tensor + python_module: special + structured_delegate: special_spherical_bessel_j0.out + variants: function + tags: pointwise + +- func: special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_spherical_bessel_j0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +# Aux function used in the test TestPythonDispatch.test_kwarg_only_and_positional_default +# within test/test_python_dispatch.py +- func: _foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor + dispatch: + CPU: foobar + autogen: _foobar.out + +- func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CPU: _fused_adam_kernel_cpu_ + CUDA: _fused_adam_kernel_cuda_ + autogen: _fused_adam, _fused_adam.out + +- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CPU: _fused_adam_kernel_cpu_ + CUDA: _fused_adam_kernel_cuda_ + autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out + +- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CPU: _fused_adamw_kernel_cpu_ + CUDA: _fused_adamw_kernel_cuda_ + autogen: _fused_adamw, _fused_adamw.out + +- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CPU: _fused_adamw_kernel_cpu_ + CUDA: _fused_adamw_kernel_cuda_ + autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out + +- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd, _fused_sgd.out + +- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out + +- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + variants: function + dispatch: + CPU: _fused_adagrad_kernel_cpu_ + autogen: _fused_adagrad, _fused_adagrad.out + +# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. 
+- func: _propagate_xla_data(Tensor input, Tensor output) -> () + variants: function diff --git a/torchgen/operator_versions/__init__.py b/torchgen/operator_versions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/operator_versions/gen_mobile_upgraders.py b/torchgen/operator_versions/gen_mobile_upgraders.py new file mode 100644 index 00000000000..362ce427d50 --- /dev/null +++ b/torchgen/operator_versions/gen_mobile_upgraders.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import os +from enum import Enum +from operator import itemgetter +from pathlib import Path +from typing import Any + +import torch +from torch.jit.generate_bytecode import generate_upgraders_bytecode +from torchgen.code_template import CodeTemplate +from torchgen.operator_versions.gen_mobile_upgraders_constant import ( + MOBILE_UPGRADERS_HEADER_DESCRIPTION, +) + + +class ByteCode(Enum): + instructions = 1 + constants = 2 + types = 3 + operators = 4 + register_size = 5 + + +EXCLUDED_OP_SET = [ + "aten::full.names", + "aten::full.out", + "aten::full", +] + +EXCLUE_UPGRADER_SET = ["full_0_4", "full_out_0_4"] + +ONE_INSTRUCTION = CodeTemplate( + """ + Instruction{OpCode::${operator_name}, ${X}, ${N}},""" +) + +INSTRUCTION_LIST = CodeTemplate( + """std::vector<Instruction>({ + ${instruction_list} + }), // instructions list""" +) + +ONE_CONSTANT = CodeTemplate( + """ + c10::IValue(${constant}),""" +) + +CONSTANT_LIST = CodeTemplate( + """std::vector<c10::IValue>({ + ${constant_list} + }), // constants list""" +) + +CONSTANTS_LIST_EMPTY = """std::vector<c10::IValue>(), // constants list""" + +ONE_TYPE = CodeTemplate("""c10::parseType("${type_str}"),""") + +TYPE_LIST = CodeTemplate( + """std::vector<c10::TypePtr>({ + ${type_list} + }), // types list""" +) + +TYPE_LIST_EMPTY = """std::vector<c10::TypePtr>(), // types list""" + +ONE_OPERATOTR_STRING = CodeTemplate( + """ + OperatorString({"${operator_name}", "${overload_name}", ${num_of_args}}),""" +) + +OPERATOR_STRING_LIST = CodeTemplate( + """ + std::vector<OperatorString>({ + ${operator_string_list} + }), // operators list""" +) + +ONE_UPGRADER_FUNCTION = CodeTemplate( + """ + mobile::Function::registerFunc( + "${upgrader_name}", + ${instruction_list}, + ${constant_list}, + ${type_list}, + ${register_size} + )""" +) + +ONE_UPGRADER_SRC = CodeTemplate( + """ + ByteCodeFunctionWithOperator({ + ${bytecode_function}, + ${operator_string_list} + }),""" +) + + +ONE_UPGRADER_IN_VERSION_MAP = CodeTemplate( + """Upgrader({${upgrader_min_version}, ${upgrader_max_version}, "${upgrader_name}", ${bytecode_func_index}})""" +) # noqa: E501 + +ONE_OPERATOR_IN_VERSION_MAP = CodeTemplate( + """ + {std::string("${operator_name}"), + std::vector<Upgrader>({ + ${upgrader_list_in_version_map} + })},""" +) + + +OPERATOR_VERSION_MAP = CodeTemplate( + """ +const std::unordered_map<std::string, std::vector<Upgrader>> +getOperatorVersionMapForMobile() { + static std::unordered_map<std::string, std::vector<Upgrader>> + operatorVersionMapForMobile({ + ${operator_list_in_version_map} + }); + return operatorVersionMapForMobile; +} +""" +) + + +UPGRADER_CPP_SRC = CodeTemplate( + MOBILE_UPGRADERS_HEADER_DESCRIPTION + + """ +#include <caffe2/serialize/versions.h> +#include <torch/csrc/jit/mobile/upgrader_mobile.h> + +namespace c10 { +TypePtr parseType(const std::string& pythonStr); +} // namespace c10 + +namespace torch { +namespace jit { + +// clang-format off + +// From operator_versions_map +${operator_version_map} + +const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() { + auto generate_upgrader_bytecode_list = []() { + std::vector<ByteCodeFunctionWithOperator> upgrader_function_list({ + ${upgrader_bytecode} + }); + for (const auto& upgrader_function : upgrader_function_list) { + for (const auto& op
: upgrader_function.operators) { + upgrader_function.function.append_operator( + op.name, + op.overload_name, + op.num_specified_args); + } + } + return upgrader_function_list; + }; + static std::vector<ByteCodeFunctionWithOperator> upgraderBytecodeList = + generate_upgrader_bytecode_list(); + return upgraderBytecodeList; +} + +// clang-format on + +} // namespace jit +} // namespace torch +""" +) + +UPGRADER_MOBILE_FILE_NAME = "upgrader_mobile.cpp" + +UPGRADER_ELEMENT = CodeTemplate( + """\ +Upgrader({${min_version}, ${max_version}, ${operator_name}, ${index}}), +""" +) + +PER_OPERATOR_UPGRADER_LIST = CodeTemplate( + """\ +{ + std::string(${operator_name}), + std::vector<Upgrader>({${upgrader_list}}); +} +""" +) + + +def construct_instruction(instruction_list_from_yaml: list[Any]) -> str: + instruction_list_part = [] + for instruction in instruction_list_from_yaml: + instruction_list_part.append( + ONE_INSTRUCTION.substitute( + operator_name=instruction[0], + X=instruction[1], + N=instruction[2], + ) + ) + return INSTRUCTION_LIST.substitute( + instruction_list="".join(instruction_list_part).lstrip("\n") + ) + + +def construct_constants(constants_list_from_yaml: list[Any]) -> str: + constants_list_part = [] + for constant_from_yaml in constants_list_from_yaml: + convert_constant = None + if isinstance(constant_from_yaml, str): + # Add quotes if it's string + convert_constant = f'"{constant_from_yaml}"' + elif isinstance(constant_from_yaml, bool): + convert_constant = "true" if constant_from_yaml else "false" + elif constant_from_yaml is None: + convert_constant = "" + elif isinstance(constant_from_yaml, int): + convert_constant = str(constant_from_yaml) + else: + raise ValueError( + f"The type of {constant_from_yaml} is {type(constant_from_yaml)}. " + "Please add change in construct_constants function in gen_mobile_upgraders.py." + ) + constants_list_part.append(ONE_CONSTANT.substitute(constant=convert_constant)) + if len(constants_list_part) == 0: + return CONSTANTS_LIST_EMPTY + return CONSTANT_LIST.substitute( + constant_list="".join(constants_list_part).lstrip("\n") + ) + + +def construct_operators(operator_list_from_yaml: list[Any]) -> str: + operator_list_part = [] + for operator in operator_list_from_yaml: + operator_list_part.append( + ONE_OPERATOTR_STRING.substitute( + operator_name=operator[0], + overload_name=operator[1], + num_of_args=operator[2], + ) + ) + return OPERATOR_STRING_LIST.substitute( + operator_string_list="".join(operator_list_part).lstrip("\n") + ) + + +def construct_types(types_tr_list_from_yaml: list[Any]) -> str: + types_tr_list_part = [] + for types_tr in types_tr_list_from_yaml: + types_tr_list_part.append(ONE_TYPE.substitute(type_str=types_tr)) + if len(types_tr_list_part) == 0: + return TYPE_LIST_EMPTY + return TYPE_LIST.substitute(type_list="".join(types_tr_list_part).lstrip("\n")) + + +def construct_register_size(register_size_from_yaml: int) -> str: + if not isinstance(register_size_from_yaml, int): + raise ValueError( + f"Input register size is {register_size_from_yaml} and " + f"its type is {type(register_size_from_yaml)}. An int type is expected."
+ ) + return str(register_size_from_yaml) + + +def construct_version_maps( + upgrader_bytecode_function_to_index_map: dict[str, Any] +) -> str: + version_map = torch._C._get_operator_version_map() + sorted_version_map_ = sorted(version_map.items(), key=itemgetter(0)) # type: ignore[no-any-return] + sorted_version_map = dict(sorted_version_map_) + + operator_list_in_version_map_part = [] + for op_name in sorted_version_map: + upgraders_in_version_map_part = [] + # TODO: remove the skip after these two operators schemas are fixed + if op_name in EXCLUDED_OP_SET: + continue + upgrader_ranges = torch._C._get_upgrader_ranges(op_name) + upgrader_entries = sorted_version_map[op_name] + assert len(upgrader_ranges) == len(upgrader_entries) + for idx, upgrader_entry in enumerate(upgrader_entries): + upgrader_name = upgrader_entry.upgrader_name + bytecode_function_index = upgrader_bytecode_function_to_index_map[ + upgrader_name + ] + upgraders_in_version_map_part.append( + ONE_UPGRADER_IN_VERSION_MAP.substitute( + upgrader_min_version=upgrader_ranges[idx].min_version, + upgrader_max_version=upgrader_ranges[idx].max_version, + upgrader_name=upgrader_name, + bytecode_func_index=bytecode_function_index, + ) + ) + operator_list_in_version_map_part.append( + ONE_OPERATOR_IN_VERSION_MAP.substitute( + operator_name=op_name, + upgrader_list_in_version_map="".join(upgraders_in_version_map_part), + ) + ) + return OPERATOR_VERSION_MAP.substitute( + operator_list_in_version_map="".join(operator_list_in_version_map_part).lstrip( + "\n" + ) + ) + + +def get_upgrader_bytecode_function_to_index_map( + upgrader_dict: list[dict[str, Any]] +) -> dict[str, Any]: + upgrader_bytecode_function_to_index_map = {} + index = 0 + for upgrader_bytecode in upgrader_dict: + for upgrader_name in upgrader_bytecode.keys(): + if upgrader_name in EXCLUE_UPGRADER_SET: + continue + upgrader_bytecode_function_to_index_map[upgrader_name] = index + index += 1 + return upgrader_bytecode_function_to_index_map + + +def write_cpp(cpp_path: str, upgrader_dict: list[dict[str, Any]]) -> None: + body_parts = [] + upgrader_bytecode_function_to_index_map = ( + get_upgrader_bytecode_function_to_index_map(upgrader_dict) + ) + version_map_src = construct_version_maps(upgrader_bytecode_function_to_index_map) + all_upgrader_src_string = [] + for upgrader_bytecode in upgrader_dict: + for upgrader_name, bytecode in upgrader_bytecode.items(): + # TODO: remove the skip after these two operators schemas are fixed + if upgrader_name in EXCLUE_UPGRADER_SET: + continue + instruction_list_str = "" + constant_list_str = "" + type_list_str = "" + register_size_str = "" + operator_list_str = "" + for table_name, contents in bytecode.items(): + element = ByteCode[table_name] + body_string = "" + if element is ByteCode.instructions: + instruction_list_str = construct_instruction(contents) + elif element is ByteCode.constants: + constant_list_str = construct_constants(contents) + elif element is ByteCode.operators: + operator_list_str = construct_operators(contents) + elif element is ByteCode.types: + type_list_str = construct_types(contents) + elif element is ByteCode.register_size: + register_size_str = construct_register_size(contents) + + one_upgrader_function_string = ONE_UPGRADER_FUNCTION.substitute( + upgrader_name=upgrader_name, + instruction_list=instruction_list_str, + constant_list=constant_list_str, + type_list=type_list_str, + register_size=register_size_str, + ) + one_upgrader_src_string = ONE_UPGRADER_SRC.substitute( + 
bytecode_function=one_upgrader_function_string.lstrip("\n"), + operator_string_list=operator_list_str.lstrip("\n"), + ) + all_upgrader_src_string.append(one_upgrader_src_string) + + upgrader_file_content = UPGRADER_CPP_SRC.substitute( + operator_version_map=version_map_src, + upgrader_bytecode="".join(all_upgrader_src_string).lstrip("\n"), + ) + body_parts.append(upgrader_file_content) + print("writing file to : ", cpp_path + "/" + UPGRADER_MOBILE_FILE_NAME) + with open(os.path.join(cpp_path, UPGRADER_MOBILE_FILE_NAME), "wb") as out_file: + final_output = "".join(body_parts) + out_file.write(upgrader_file_content.encode("utf-8")) + + +def sort_upgrader(upgrader_list: list[dict[str, Any]]) -> list[dict[str, Any]]: + sorted_upgrader_list = sorted( + upgrader_list, key=lambda one_upgrader: next(iter(one_upgrader)) + ) + return sorted_upgrader_list + + +def main() -> None: + upgrader_list = generate_upgraders_bytecode() + sorted_upgrader_list = sort_upgrader(upgrader_list) + for up in sorted_upgrader_list: + print("after sort upgrader : ", next(iter(up))) + + pytorch_dir = Path(__file__).resolve().parents[2] + upgrader_path = pytorch_dir / "torch" / "csrc" / "jit" / "mobile" + write_cpp(str(upgrader_path), sorted_upgrader_list) + + +if __name__ == "__main__": + main() diff --git a/torchgen/operator_versions/gen_mobile_upgraders_constant.py b/torchgen/operator_versions/gen_mobile_upgraders_constant.py new file mode 100644 index 00000000000..04b5ad887e5 --- /dev/null +++ b/torchgen/operator_versions/gen_mobile_upgraders_constant.py @@ -0,0 +1,7 @@ +MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/** + * @generated + * This is an auto-generated file. Please do not modify it by hand. + * To re-generate, please run: + * cd ~/pytorch && python torchgen/operator_versions/gen_mobile_upgraders.py + */ +""" diff --git a/torchgen/packaged/ATen/native/native_functions.yaml b/torchgen/packaged/ATen/native/native_functions.yaml new file mode 100644 index 00000000000..d23abffea99 --- /dev/null +++ b/torchgen/packaged/ATen/native/native_functions.yaml @@ -0,0 +1,14910 @@ +# See README.md in this directory for more guidance + +# *********NB: _cast_* operators are DEPRECATED and will be removed +# eventually. These were previously used before TorchScript IR supported +# representing ScalarType's. They are now superseded by usage of +# `aten::to()`. The ops remain here for backward compatibility purposes. + +# DEPRECATED. DO NOT USE +- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# DEPRECATED. DO NOT USE +- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor + variants: function + +# Computes the gradient of current tensor w.r.t. graph leaves. +- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? 
retain_graph=None, bool create_graph=False) -> () + manual_cpp_binding: True + variants: method + +# DEPRECATED. Sets the tensor data held by this `Variable` to be the same as +# `new_data`. It requires that `new_data` and `Variable` have compatible tensor +# type, by checking `_has_compatible_shallow_copy_type(this, new_data)`. +# +# This function is deprecated because it doesn't really make sense in a world +# where Variables *are* Tensors (as opposed to them containing tensors, which +# is what the previous interpretation was.) +- func: set_data(Tensor(a!) self, Tensor new_data) -> () + manual_cpp_binding: True + variants: method + +- func: data(Tensor self) -> Tensor + manual_cpp_binding: True + variants: method + +# True if this `Variable` is a leaf and thus does not have a `grad_fn`. +- func: is_leaf(Tensor self) -> bool + manual_cpp_binding: True + variants: method + +# Returns the output index of this variable from the forward operation that +# produced it. Conversely, it returns the input index of the gradient `Node` to +# which this `Variable` is connected (because in the gradient computation, +# inputs and outputs switch meaning). For example: +# +# y0, y1, y2 = f(x) +# assert y0.output_nr == 0 +# assert y1.output_nr == 1 +# assert y2.output_nr == 2 +# +- func: output_nr(Tensor self) -> int + manual_cpp_binding: True + variants: method + +- func: _version(Tensor self) -> int + manual_cpp_binding: True + variants: method + +- func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) + manual_cpp_binding: True + variants: method + +# Enables .grad attribute for non-leaf Tensors. +- func: retain_grad(Tensor(a!) self) -> () + manual_cpp_binding: True + variants: method + +- func: retains_grad(Tensor self) -> bool + manual_cpp_binding: True + variants: method + +- func: _fw_primal(Tensor(a) self, int level) -> Tensor(a) + variants: method + dispatch: + CompositeExplicitAutograd: _fw_primal + +- func: _make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a) + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual + +- func: _unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent) + variants: function + +# NOTE: [_new_zeros_with_same_feature_meta] +# This function creates a new tensor with the layout and TensorOptions +# of `other` but also takes into account the batch dimensions of `self` +# +# This function has a couple extra constraints because it is also used for `jvp` +# in functorch. +# - is used for forward AD because there is the restriction +# that the primal and tangent must have the same layout +# - We cannot assume that `self` and `other` have the same sizes or even dim +# because in the inplace over view case, `other` is the base tensor, and +# `self` is the forward grad with respect to the view, which can have an +# entirely different shape +# - takes the number of batch dims for `self` because we also handle +# some batching logic. We handle that here instead of a batching rule because +# we'd like to avoid calling as_strided in the batching rule (as to enable +# nested vmap in functorch). +# - needs to be CompositeExplicitAutograd for jvp support in functorch. +# functorch currently relies on TensorWrapper which does not have storage +# CompositeExplicitAutograd makes sure the TensorWrapper is unwrapped. 
+# - this function may eventually take on another int argument to store the +# the number of batch dims for other once we support that use case +- func: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _new_zeros_with_same_feature_meta + autogen: _new_zeros_with_same_feature_meta.out + +# This function compares the storage numel of self with that of other, where +# storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`. +# We create this function for composite compliance purposes. The batching rule +# always returns true because vmapped as_strided does not support accessing +# storage locations not indexable by the input tensor. +# See the note above for more information. +- func: _has_same_storage_numel(Tensor self, Tensor other) -> bool + variants: function + dispatch: + CompositeExplicitAutograd: _has_same_storage_numel + +- func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) + variants: method + tags: inplace_view + +- func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) + variants: method + +- func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) + variants: method + +- func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) + variants: method + +- func: align_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: align_tensors(Tensor[] tensors) -> Tensor[] + +# Not assert because it's a keyword; not Assert because FX already +# took that syntax +# TODO: need to specify this is side-effectful somehow +- func: _assert_async(Tensor self) -> () + dispatch: + CPU: _assert_async_cpu + CUDA: _assert_async_cuda + +- func: _assert_async.msg(Tensor self, str assert_msg) -> () + dispatch: + CPU: _assert_async_msg_cpu + CUDA: _assert_async_msg_cuda + +- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? 
dtype=None) -> () + +- func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) + variants: method + +- func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool + device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss + dispatch: + CUDA: _use_cudnn_ctc_loss + +- func: _use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool + device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss + dispatch: + CUDA: _use_cudnn_ctc_loss_tensor + +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) + device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU + dispatch: + CUDA: _cudnn_ctc_loss + autogen: _cudnn_ctc_loss.out + +- func: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) + device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU + dispatch: + CUDA: _cudnn_ctc_loss_tensor + +- func: _use_cudnn_rnn_flatten_weight() -> bool + +- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor + dispatch: + CUDA: _cudnn_rnn_flatten_weight + autogen: _cudnn_rnn_flatten_weight.out + +- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + # rnn_tanh may or may not redispatch to _cudnn_rnn based on algorithm and build. Thus it might hit dispatch or kernel device check. + # Disable dispatch time device check for consistent behavior. + device_check: NoCheck + dispatch: + CUDA: _cudnn_rnn + autogen: _cudnn_rnn.out + tags: nondeterministic_seeded + +- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + dispatch: + CUDA: _cudnn_rnn_backward + autogen: _cudnn_rnn_backward.out + +- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CUDA: _cudnn_init_dropout_state + autogen: _cudnn_init_dropout_state.out + tags: nondeterministic_seeded + +- func: _debug_has_internal_overlap(Tensor self) -> int + variants: function + +- func: _fused_dropout(Tensor self, float p, Generator? 
generator=None) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: fused_dropout_cuda + tags: nondeterministic_seeded + autogen: _fused_dropout.out + +- func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor + variants: function + dispatch: + CUDA: masked_scale_cuda + autogen: _masked_scale.out + +- func: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: native_dropout_cpu + CUDA: native_dropout_cuda + NestedTensorCPU, NestedTensorCUDA: native_dropout_nested + tags: [nondeterministic_seeded, core] + autogen: native_dropout.out + +- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor + dispatch: + CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward + CUDA: native_dropout_backward_cuda + autogen: native_dropout_backward.out + tags: pointwise + +- func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + +- func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) + +- func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) + +- func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) + +- func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor + +- func: _shape_as_tensor(Tensor self) -> Tensor + +- func: dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: feature_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: alpha_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor + tags: nondeterministic_seeded + +- func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: abs(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: abs + SparseCPU, SparseCUDA: abs_sparse + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs + tags: [core, pointwise] + +- func: abs_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: abs_ + SparseCPU, SparseCUDA: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_ + +- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: abs_out + MPS: abs_out_mps + SparseCPU, SparseCUDA: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out + tags: pointwise + +# Note [Adding an alias] +# To add an alias do the following: +# +# 1) Copy the original functions native_functions.yaml entry, but replace the +# original function's name with their own and delete any dispatch +# keys for the aliases. 
Specifying a dispatch key will prevent +# autograd from recording the operations the alias performs, which +# will stop it from "inheriting" the original operation's autograd behavior. +# 2) Implement the corresponding functions and have them redispatch to the +# original function. +# 3) Add docstrings to the new function that reference the original function, +# and document the method as usual (if it exists.) +# (See torch/_torch_docs.py and docs/source/torch.rst if adding a function, +# torch/_tensor_docs.py and docs/source/tensors.rst if adding a method, +# or module-specific doc bindings (like torch/linalg/__init__.py) if +# adding an alias in a namespace.) +# 4) Update torch/overrides.py consistent with the original function. +# 5) Update the alias_map in torch/csrc/jit/passes/normalize_ops.cpp. +# 6) Add aliases argument to existing OpInfo/UnaryUfuncInfo or create new OpInfo/UnaryUfuncInfo entry +# in op_db list in torch/testing/_internal/common_methods_invocations.py +# +# See torch.absolute, an alias for torch.abs, as an example. +# Absolute, alias for abs + +- func: absolute(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: absolute_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: angle(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: angle + SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr + tags: pointwise + +- func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: angle_out + SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out + tags: pointwise + +- func: view_as_real(Tensor(a) self) -> Tensor(a) + variants: function + dispatch: + CPU, CUDA, MPS, Meta: view_as_real + +- func: view_as_complex(Tensor(a) self) -> Tensor(a) + variants: function + dispatch: + CPU, CUDA, Meta: view_as_complex + +- func: sgn(Tensor self) -> Tensor + variants: function, method + structured_delegate: sgn.out + dispatch: + SparseCPU, SparseCUDA: sgn_sparse + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn + tags: pointwise + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + structured_delegate: sgn.out + dispatch: + SparseCPU, SparseCUDA: sgn_sparse_ + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_ + tags: pointwise + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sgn_out + SparseCPU, SparseCUDA: sgn_sparse_out + SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out + tags: pointwise + +- func: chalf(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor + variants: method + +- func: real(Tensor(a) self) -> Tensor(a) + device_check: NoCheck # TensorIterator + variants: function + +- func: imag(Tensor(a) self) -> Tensor(a) + device_check: NoCheck # TensorIterator + variants: function + +- func: _conj(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: _conj + +- func: conj(Tensor(a) self) -> Tensor(a) + variants: function, method + manual_cpp_binding: True + +- func: _conj_physical(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _conj_physical + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr + autogen: _conj_physical.out + +- func: conj_physical(Tensor self) -> Tensor + variants: function, method + tags: pointwise + +- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_physical_out + SparseCPU, SparseCUDA: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out + tags: pointwise + +- func: conj_physical_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: conj_physical_ + SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_ + tags: pointwise + +- func: resolve_conj(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: resolve_neg(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: _neg_view(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: _neg_view + +- func: acos(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: acos.out + tags: [core, pointwise] + +- func: acos_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: acos.out + tags: pointwise + +- func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: acos_out + MPS: acos_out_mps + tags: pointwise + +# arccos, alias of acos +- func: arccos(Tensor self) -> Tensor + variants: function, method + +- func: arccos_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + +- func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) + +- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: add.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: add_sparse + SparseCsrCPU, SparseCsrCUDA: add_sparse_csr + MkldnnCPU: mkldnn_add + ZeroTensor: add_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor + tags: [core, pointwise] + +- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + structured_delegate: add.out + dispatch: + SparseCPU, SparseCUDA: add_sparse_ + SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ + MkldnnCPU: mkldnn_add_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor + tags: pointwise + +- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + ufunc_inner_loop: + Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) + ScalarOnly: add (Bool) + dispatch: + SparseCPU: add_out_sparse_cpu + SparseCUDA: add_out_sparse_cuda + SparseCsrCPU: add_out_sparse_csr_cpu + SparseCsrCUDA: add_out_sparse_csr_cuda + MkldnnCPU: mkldnn_add_out + MPS: add_out_mps + tags: pointwise + +- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_ + +- func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_out + +- func: _add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_ + autogen: _add_relu.Scalar_out + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: add + tags: [core, pointwise] + +- func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: add_ + autogen: add.Scalar_out + tags: pointwise + +- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + structured_delegate: addmv.out + variants: function, method + +- func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + structured_delegate: addmv.out + variants: function, method + +- func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmv_out_cpu + CUDA: addmv_out_cuda + MPS: addmv_out_mps + SparseCsrCPU: addmv_out_sparse_compressed + SparseCsrCUDA: addmv_out_sparse_compressed_cuda + +- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: addr + MPS: addr_mps + CompositeExplicitAutograd: math_addr + +- func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: addr_ + +- func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: addr_out + MPS: addr_out_mps + CompositeExplicitAutograd: math_addr_out + +- func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: affine_grid_generator + autogen: affine_grid_generator.out + +- func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor + variants: function + +- func: _is_all_true(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _is_all_true + +- func: _is_any_true(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: _is_any_true + +# Note: this function is only for testing. +- func: _test_check_tensor(Tensor self) -> Tensor + variants: function + +# Note; this function is only for testing +- func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + CPU: _test_functorch_fallback + autogen: _test_functorch_fallback.out + +- func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.out + variants: function, method + +- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: all_out + MPS: all_out_mps + +- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool + variants: function, method + tags: data_dependent_output + dispatch: + CompositeExplicitAutograd: allclose + +- func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.out + variants: function, method + +- func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: any_out + MPS: any_out_mps + +- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + +- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + +# This operator should be named `aragne.start_out` if following the naming convention. However that +# name is already taken. Disabled because of CI job failures. +# FIXME: enable this +#- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) +# dispatch: +# CompositeExplicitAutograd: arange_start_out + +- func: arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: arange + cpp_no_default_args: ['step'] + tags: core + +- func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: arange_out + +- func: arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: arange_out + CUDA: arange_cuda_out + MPS: arange_mps_out + cpp_no_default_args: ['step'] + +# This function is a temporary hack to allow tracing of arange like constructs with dynamic +# bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; +# if the range you need is based on another tensor, calling this function directly will +# preserve tracing. Get rid of this when arange can directly take tensors for bounds +# (so that it can be traced directly). +- func: _dim_arange(Tensor like, int dim) -> Tensor + +- func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor + structured_delegate: argmax.out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: argmax_out + MPS: argmax_out_mps + +- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor + structured_delegate: argmin.out + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + +- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: argmin_out + MPS: argmin_out_mps + +- func: acosh(Tensor self) -> Tensor + variants: function, method + structured_delegate: acosh.out + tags: [core, pointwise] + +- func: acosh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + structured_delegate: acosh.out + tags: pointwise + +- func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: acosh_out + MPS: acosh_out_mps + tags: pointwise +# arccosh, alias for acosh + +- func: arccosh(Tensor self) -> Tensor + variants: function, method + +- func: arccosh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: asinh(Tensor self) -> Tensor + variants: function, method + structured_delegate: asinh.out + dispatch: + SparseCPU, SparseCUDA: asinh_sparse + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr + tags: [core, pointwise] + +- func: asinh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + structured_delegate: asinh.out + dispatch: + SparseCPU, SparseCUDA: asinh_sparse_ + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_ + tags: pointwise + +- func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: asinh_out + MPS: asinh_out_mps + SparseCPU, SparseCUDA: asinh_sparse_out + SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out + tags: pointwise + +# arcsinh, alias for asinh +- func: arcsinh(Tensor self) -> Tensor + variants: function, method + +- func: arcsinh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
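+ +# Illustrative sketch, not part of the upstream schema comments: an alias entry such as +# arcsinh carries no dispatch keys and simply redispatches to the original op, which can +# be sanity-checked from Python: +# >>> import torch +# >>> x = torch.randn(3) +# >>> torch.equal(torch.arcsinh(x), torch.asinh(x)) +# True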
+ +- func: atanh(Tensor self) -> Tensor + structured_delegate: atanh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atanh_sparse + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr + tags: [core, pointwise] + +- func: atanh_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: atanh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atanh_sparse_ + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_ + tags: pointwise + +- func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atanh_out + MPS: atanh_out_mps + SparseCPU, SparseCUDA: atanh_sparse_out + SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out + tags: pointwise +# arctanh, alias for atanh + +- func: arctanh(Tensor self) -> Tensor + variants: function, method + +- func: arctanh_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a) + variants: function, method + dispatch: + ZeroTensor, CPU, CUDA: as_strided_tensorimpl + Meta: as_strided_tensorimpl_meta_symint + MPS: as_strided_tensorimpl_mps + QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl + device_check: NoCheck + device_guard: False + tags: core + +- func: as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided__symint + +- func: asin(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: asin.out + dispatch: + SparseCPU, SparseCUDA: asin_sparse + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr + tags: [core, pointwise] + +- func: asin_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: asin.out + dispatch: + SparseCPU, SparseCUDA: asin_sparse_ + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_ + tags: pointwise + +- func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: asin_out + MPS: asin_out_mps + SparseCPU, SparseCUDA: asin_sparse_out + SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out + tags: pointwise + +# arcsin, alias of asin +- func: arcsin(Tensor self) -> Tensor + variants: function, method + +- func: arcsin_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atan(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: atan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atan_sparse + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr + tags: [core, pointwise] + +- func: atan_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: atan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: atan_sparse_ + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_ + tags: pointwise + +- func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atan_out + MPS: atan_out_mps + SparseCPU, SparseCUDA: atan_sparse_out + SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out + tags: pointwise + +# arctan, alias of atan +- func: arctan(Tensor self) -> Tensor + variants: function, method + +- func: arctan_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atleast_1d(Tensor self) -> Tensor + variants: function + +- func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] + +- func: atleast_2d(Tensor self) -> Tensor + variants: function + +- func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] + variants: function + +- func: atleast_3d(Tensor self) -> Tensor + variants: function + +- func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] + variants: function + +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + structured_delegate: baddbmm.out + +- func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + structured_delegate: baddbmm.out + +- func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: baddbmm_out_cuda + MPS: baddbmm_out_mps + SparseCsrCUDA: baddbmm_out_sparse_csr_cuda + +- func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: bartlett_window + autogen: bartlett_window.out + +- func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: bartlett_window + autogen: bartlett_window.periodic_out + +- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor + +- func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor + dispatch: + QuantizedCPU: quantized_batch_norm + autogen: quantized_batch_norm.out + +- func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int) + +- func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor) + +# Sample bernoulli with values in `self` as probability. +- func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: bernoulli + tags: nondeterministic_seeded + +- func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_out + MPS: bernoulli_out_mps + +- func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.Tensor, bernoulli.Tensor_out + +- func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.float_out + +# Note [bernoulli.p schema] +# We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking) +# This out-of-place version isn't used explicitly, but needed by jit. +# There is no default valid on `p` here because it would introduce ambiguity +# with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. +- func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutogradNonFunctional: bernoulli + +- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor + +- func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_cpu + CUDA: binary_cross_entropy_cuda + MPS: binary_cross_entropy_mps + +- func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_out_cpu + CUDA: binary_cross_entropy_out_cuda + MPS: binary_cross_entropy_out_mps + +- func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_backward_cpu + CUDA: binary_cross_entropy_backward_cuda + MPS: binary_cross_entropy_backward_mps + +- func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + variants: function + dispatch: + CPU: binary_cross_entropy_backward_out_cpu + CUDA: binary_cross_entropy_backward_out_cuda + MPS: binary_cross_entropy_backward_out_mps + +- func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: binary_cross_entropy_with_logits + autogen: binary_cross_entropy_with_logits.out + +- func: bincount(Tensor self, Tensor? 
weights=None, int minlength=0) -> Tensor + variants: function, method + dispatch: + CPU: _bincount_cpu + CUDA: _bincount_cuda + MPS: _bincount_mps + tags: dynamic_output_shape + autogen: bincount.out + +- func: bitwise_not(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: bitwise_not.out + variants: function, method + tags: [core, pointwise] + +- func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: bitwise_not.out + variants: method + tags: pointwise + +- func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_not_out + MPS: bitwise_not_out_mps + tags: pointwise + +- func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA, MPS: copysign_out + tags: pointwise + +- func: copysign.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: copysign.out + tags: pointwise + +- func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: copysign.out + +- func: copysign.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: copysign + tags: pointwise + +- func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: copysign_ + +- func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: copysign_out + tags: pointwise + +- func: logical_not(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_not + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not + tags: [core, pointwise] + +- func: logical_not_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_not_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_ + tags: pointwise + +- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_not_out + MPS: logical_not_out_mps + tags: pointwise + +- func: logical_xor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_xor + tags: pointwise + +- func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_xor_ + tags: pointwise + +- func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_xor_out + MPS: logical_xor_out_mps + tags: pointwise + +- func: logical_and(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_and + tags: [core, pointwise] + +- func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_and_ + tags: pointwise + +- func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_and_out + MPS: logical_and_out_mps + tags: pointwise + +- func: logical_or(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logical_or + tags: [core, pointwise] + +- func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: logical_or_ + tags: pointwise + +- func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: logical_or_out + MPS: logical_or_out_mps + tags: pointwise + +- func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: blackman_window + autogen: blackman_window.out + +- func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: blackman_window + autogen: blackman_window.periodic_out + +- func: bmm(Tensor self, Tensor mat2) -> Tensor + structured_delegate: bmm.out + variants: function, method + dispatch: + SparseCPU: bmm_sparse_cpu + SparseCUDA: bmm_sparse_cuda + NestedTensorCPU: bmm_nested + NestedTensorCUDA: bmm_nested_cuda + tags: core + +- func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + MPS: bmm_out_mps + SparseCPU: bmm_out_sparse_cpu + SparseCUDA: bmm_out_sparse_cuda + SparseCsrCUDA: bmm_out_sparse_csr_cuda + +- func: broadcast_tensors(Tensor[] tensors) -> Tensor[] + device_check: NoCheck + device_guard: False + +- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: broadcast_to_symint + +- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) + variants: function + dispatch: + SparseCPU, SparseCUDA: sparse_broadcast_to + +- func: cat(Tensor[] tensors, int dim=0) -> Tensor + structured_delegate: cat.out + dispatch: + SparseCPU, SparseCUDA: cat_sparse + QuantizedCPU: cat_quantized_cpu + tags: core + +- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + precomputed: + - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format + dispatch: + CPU: cat_out_cpu + CUDA: cat_out_cuda + MPS: cat_out_mps + QuantizedCPU: cat_out_quantized_cpu + +- func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +# alias for torch.cat +- func: concat(Tensor[] tensors, int dim=0) -> Tensor + +- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) 
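+
+# A minimal usage sketch (assuming the usual Python bindings; illustrative
+# only, not part of the schema): since `concat` and `concatenate` are aliases
+# of `cat`, the three spellings give the same result, e.g.
+#   import torch
+#   a, b = torch.randn(2, 3), torch.randn(2, 3)
+#   torch.concat([a, b], dim=0)   # same as torch.cat([a, b], dim=0)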
+ +# alias for torch.cat +- func: concatenate(Tensor[] tensors, int dim=0) -> Tensor + +- func: concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +- func: block_diag(Tensor[] tensors) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: block_diag + autogen: block_diag.out + +- func: ceil(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: ceil.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: ceil_sparse + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr + tags: pointwise + +- func: ceil_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: ceil.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: ceil_sparse_ + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_ + tags: pointwise + +- func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: ceil_out + MPS: ceil_out_mps + SparseCPU, SparseCUDA: ceil_sparse_out + SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out + tags: pointwise + +# alias for torch.linalg.multi_dot +- func: chain_matmul(Tensor[] matrices) -> Tensor + variants: function + +# alias for torch.linalg.multi_dot +- func: chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!) + +- func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + +- func: chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: chunk + NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor + +- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_sections_symint + +- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_indices_symint + +- func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] + variants: function, method + +- func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ['min'] + structured_delegate: clamp.out + dispatch: + QuantizedCPU: clamp_quantized_cpu + tags: [core, pointwise] + +- func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor + variants: function, method + structured_delegate: clamp.Tensor_out + tags: pointwise + +- func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ['min'] + structured_delegate: clamp.out + tags: pointwise + +- func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) + variants: function, method + structured_delegate: clamp.Tensor_out + tags: pointwise + +- func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + cpp_no_default_args: ['min'] + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_out + MPS: clamp_out_mps + tags: pointwise + +- func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_Tensor_out + MPS: clamp_Tensor_out_mps + tags: pointwise + +- func: clamp_max(Tensor self, Scalar max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_max.out + tags: pointwise + +- func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor + variants: function, method + structured_delegate: clamp_max.Tensor_out + tags: pointwise + +- func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_max.out + tags: pointwise + +- func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!) + variants: function, method + structured_delegate: clamp_max.Tensor_out + tags: pointwise + +- func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_max_out + MPS: clamp_max_out_mps + tags: pointwise + +- func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_max_Tensor_out + MPS: clamp_max_Tensor_out_mps + tags: pointwise + +- func: clamp_min(Tensor self, Scalar min) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_min.out + tags: pointwise + +- func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor + variants: function, method + structured_delegate: clamp_min.Tensor_out + tags: pointwise + +- func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: clamp_min.out + tags: pointwise + +- func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!) + variants: function, method + structured_delegate: clamp_min.Tensor_out + tags: pointwise + +- func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_min_out + MPS: clamp_min_out_mps + tags: pointwise + +- func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: clamp_min_Tensor_out + MPS: clamp_min_Tensor_out_mps + tags: pointwise + +# clip is an alias for clamp +- func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + cpp_no_default_args: ['min'] + variants: function, method + tags: pointwise + +- func: clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor + variants: function, method + tags: pointwise + +- func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + cpp_no_default_args: ['min'] + variants: function, method + tags: pointwise + +- func: clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? 
max=None) -> Tensor(a!) + variants: function, method + tags: pointwise + +- func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['min'] + tags: pointwise + +- func: clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: cudnn_is_acceptable(Tensor self) -> bool + device_check: NoCheck + device_guard: False + +- func: complex(Tensor real, Tensor imag) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: complex + +- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out + +- func: polar(Tensor abs, Tensor angle) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: polar + +- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out + +- func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: constant_pad_nd + MPS: constant_pad_nd_mps + autogen: constant_pad_nd.out + tags: core + +- func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + variants: method + manual_cpp_binding: True + +- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor + dispatch: + CompositeExplicitAutograd: convolution + autogen: convolution.out + tags: core + +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd, CUDA: convolution_backward + autogen: convolution_backward.out + tags: core + +- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor + dispatch: + CompositeExplicitAutograd: convolution_overrideable + autogen: convolution_overrideable.out + +- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + dispatch: + CompositeExplicitAutograd: convolution_backward_overrideable + autogen: convolution_backward_overrideable.out + +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor + dispatch: + CompositeExplicitAutograd: _convolution + autogen: _convolution.out + +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + +- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor + +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv1d_symint + +- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv2d_symint + +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv3d_symint + +- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + +- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + +- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, str padding="valid", int[3] dilation=1, int groups=1) -> Tensor + cpp_no_default_args: ['bias', 'stride', 'padding'] + +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor + dispatch: + CompositeExplicitAutograd: conv_tbc + autogen: conv_tbc.out + +- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) + +# NB: we inherit the goofy argument order from PyTorch torch.nn.functional +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose1d_symint + +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose2d_symint + +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose3d_symint + +- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: copy + +- func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: copy_mkldnn_ + SparseCPU, SparseCUDA: copy_sparse_wrapper_ + CompositeExplicitAutograd: copy_ + SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ + NestedTensorCPU, NestedTensorCUDA: copy_nested_ + autogen: copy.out + +- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor + dispatch: + MPS: _copy_from_mps + autogen: _copy_from.out + +# We need this to be able to properly copy from a CPU to an XLA tensor with different sizes. 
+# See https://github.com/pytorch/xla/issues/2881 +- func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor + dispatch: + MPS: _copy_from_and_resize_mps + autogen: _copy_from_and_resize.out + +- func: cos(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cos.out + tags: [core, pointwise] + +- func: cos_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cos.out + tags: pointwise + +- func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: cos_out + MPS: cos_out_mps + tags: pointwise + +- func: cosh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cosh.out + tags: [core, pointwise] + +- func: cosh_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: cosh.out + tags: pointwise + +- func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: cosh_out + MPS: cosh_out_mps + tags: pointwise + +- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor + +- func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor + variants: function, method + dispatch: + CPU: count_nonzero_cpu + CUDA: count_nonzero_cuda + MPS: count_nonzero_mps + autogen: count_nonzero.dim_IntList_out + +- func: count_nonzero(Tensor self, int? dim=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: count_nonzero + autogen: count_nonzero.out + +- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor + variants: function, method + +- func: corrcoef(Tensor self) -> Tensor + variants: function, method + +- func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid + dispatch: + CUDA: cudnn_affine_grid_generator_forward + autogen: cudnn_affine_grid_generator.out + +# TODO: Why do I have to call this grad?! +- func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta + dispatch: + CUDA: cudnn_affine_grid_generator_backward + autogen: cudnn_affine_grid_generator_backward.out + +- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: cudnn_batch_norm + autogen: cudnn_batch_norm.out + +# NB: You can only use this if you used cudnn_batch_norm training=True +- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: cudnn_batch_norm_backward + autogen: cudnn_batch_norm_backward.out + +- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + dispatch: + CUDA: cudnn_convolution + autogen: cudnn_convolution.out + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + dispatch: + CUDA: cudnn_convolution_transpose + autogen: cudnn_convolution_transpose.out + +- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution_transpose + autogen: _mps_convolution_transpose.out + +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + MPS: mps_convolution_transpose_backward + autogen: mps_convolution_transpose_backward.out + +- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor + dispatch: + CUDA: cudnn_convolution_relu + autogen: cudnn_convolution_relu.out + +- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor + dispatch: + CUDA: cudnn_convolution_add_relu + autogen: cudnn_convolution_add_relu.out + +# NB: input is special cased in a way I don't quite understand +- func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output + dispatch: + CUDA: cudnn_grid_sampler_forward + autogen: cudnn_grid_sampler.out + +- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid) + dispatch: + CUDA: cudnn_grid_sampler_backward + autogen: cudnn_grid_sampler_backward.out + +- func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: cummax + +- func: cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: cummax_out + +- func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + +- func: _cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> () + variants: function + dispatch: + CPU: cummax_helper_cpu + CUDA: cummax_helper_cuda + +- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: cummin + +- func: cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: cummin_out + +- func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + +- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> () + variants: function + dispatch: + CPU: cummin_helper_cpu + CUDA: cummin_helper_cuda + +- func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumprod.out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumprod.out + variants: method + +- func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: cumprod_out + +- func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) + variants: method + +- func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: cumprod_backward(Tensor grad, Tensor input, int dim, Tensor output) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumsum.out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumsum.out + variants: method + +- func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: cumsum_out + MPS: cumsum_out_mps + +- func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) + variants: method + +- func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + +- func: cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + +- func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor + +# convenience function that converts to intlists for you +- func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor + +- func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + autogen: _ctc_loss.out + tags: dynamic_output_shape # the shape of second output is data dependent + +- func: _ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: ctc_loss_tensor + autogen: _ctc_loss.Tensor_out + tags: dynamic_output_shape # the shape of second output is data dependent + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + autogen: _ctc_loss_backward.out + +- func: _ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor + dispatch: + CPU, CUDA: ctc_loss_backward_tensor + +- func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutogradNonFunctional: diag_embed + autogen: diag_embed.out + +- func: diagflat(Tensor self, int offset=0) -> Tensor + variants: function, method + +- func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: diagonal + +- func: linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a) + python_module: linalg + variants: function + +- func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) + variants: function, method + +- func: diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: diagonal_backward_symint + autogen: diagonal_backward.out + +- func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) + variants: method + +- func: diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor + variants: function, method + +- func: diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +- func: gradient.scalarint(Tensor self, *, Scalar? spacing=None, int? 
dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalararray(Tensor self, *, Scalar spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.array(Tensor self, *, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.scalarrayarray(Tensor self, *, Scalar[] spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.tensorarrayint(Tensor self, *, Tensor[] spacing, int? dim=None, int edge_order=1) -> Tensor[] + variants: function + +- func: gradient.tensorarray(Tensor self, *, Tensor[] spacing, int[] dim, int edge_order=1) -> Tensor[] + variants: function + +- func: div.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: div.out + dispatch: + SparseCPU, SparseCUDA: div_sparse + ZeroTensor: div_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor + tags: [core, pointwise] + +- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: div.out + dispatch: + SparseCPU, SparseCUDA: div_sparse_ + tags: pointwise + +- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: div_out + MPS: div_out_mps + SparseCPU, SparseCUDA: div_out_sparse_zerodim + tags: pointwise + +- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: div.out_mode + dispatch: + SparseCPU, SparseCUDA: div_sparse + tags: pointwise + +- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: div.out_mode + dispatch: + SparseCPU, SparseCUDA: div_sparse_ + tags: pointwise + +- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: div_out_mode + MPS: div_out_mode_mps + SparseCPU, SparseCUDA: div_out_sparse_zerodim + tags: pointwise + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: div.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: div + NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar + tags: [core, pointwise] + +- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: div_ + autogen: div.Scalar_out + tags: pointwise + +- func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: div + tags: pointwise + +- func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) 
+ variants: method + dispatch: + CompositeExplicitAutograd: div_ + autogen: div.Scalar_mode_out + tags: pointwise + +# divide, alias for div +- func: divide.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: divide.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor + variants: function, method + +- func: divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!) + variants: method + +- func: divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!) + +- func: divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor + variants: function, method + +- func: divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) + variants: method + + # true_divide, an alias for div +- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: dot(Tensor self, Tensor tensor) -> Tensor + variants: function, method + dispatch: + CPU: dot + CUDA: dot_cuda + MPS: dot_mps + +- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: dot_out + +- func: vdot(Tensor self, Tensor other) -> Tensor + variants: function, method + dispatch: + CPU: vdot + CUDA: vdot_cuda + +- func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: vdot_out + +- func: einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor + +- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor + dispatch: + CompositeExplicitAutograd: embedding_symint + NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding + autogen: embedding.out + +- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor + dispatch: + CompositeImplicitAutograd: embedding_backward_symint + +- func: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor + dispatch: + CPU: embedding_dense_backward_cpu + CUDA: embedding_dense_backward_cuda + MPS: embedding_dense_backward_mps + autogen: embedding_dense_backward.out + tags: core + +- func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
+ dispatch: + CPU: embedding_renorm_cpu_ + CUDA: embedding_renorm_cuda_ + autogen: embedding_renorm, embedding_renorm.out + +- func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor + +# NOTE [ embedding_bag Native Functions ] +# The `_embedding_bag.*` variants assume that input tensors except for `weight`, +# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous. +# We really only need to enforce this for `_embedding_bag` (the forward) because +# the backward inputs are the same as forward ones. +# The above `embedding_bag` wrapper is created to achieve this, e.g., +# applying indices = indices.contiguous(). +# The backward functions apply a check that these input tensors are contiguous. + + +- func: _embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _embedding_bag_forward_only_cpu + CUDA: _embedding_bag_forward_only_cuda + autogen: _embedding_bag_forward_only.out + +- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) + +# row_stack is the alias of vstack +- func: row_stack(Tensor[] tensors) -> Tensor + +- func: row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + +# To keep backward and forward compatibility, and to avoid ambiguity with the +# original signature above, scale_grad_by_freq, mode, sparse, +# per_sample_weights, and include_last_offset parameters do not have default +# values. Once the original signature is removed, default values can be added. +- func: embedding_bag.padding_idx(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, bool include_last_offset, int? padding_idx) -> (Tensor, Tensor, Tensor, Tensor) + +- func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _embedding_bag_cpu + CUDA: _embedding_bag_cuda + autogen: _embedding_bag.out + +- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CompositeImplicitAutograd: _embedding_bag_backward_symint + +- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CompositeImplicitAutograd: _embedding_bag_sparse_backward_symint + +- func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? 
per_sample_weights, int padding_idx=-1) -> Tensor + dispatch: + CPU: _embedding_bag_dense_backward_cpu + CUDA: _embedding_bag_dense_backward_cuda + autogen: _embedding_bag_dense_backward.out + +- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor + dispatch: + CPU: _embedding_bag_per_sample_weights_backward_cpu + CUDA: _embedding_bag_per_sample_weights_backward_cuda + autogen: _embedding_bag_per_sample_weights_backward.out + +- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: empty_names + autogen: empty.names_out + +- func: empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + CPU: empty_cpu + CUDA: empty_cuda + MPS: empty_mps + Meta: empty_meta_symint + MkldnnCPU: empty_mkldnn + SparseCPU, SparseCUDA, SparseMeta: empty_sparse + SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized + tags: core + +- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: empty_permuted_symint + autogen: empty_permuted.out + +# We do not make new_empty a composite that calls into new_empty_strided, as the strided version +# is significantly more difficult to implement by different backends +- func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + CompositeExplicitAutograd: new_empty_symint + autogen: new_empty.out + +- func: new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + CompositeExplicitAutogradNonFunctional: new_empty_strided_symint + autogen: new_empty_strided.out + +- func: new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_full + autogen: new_full.out + +- func: new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_zeros + autogen: new_zeros.out + +- func: new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + variants: method + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: new_ones + autogen: new_ones.out + +# other overrides are to provide a more helpful error message that dtype is required +- func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + dispatch: + CPU: empty_affine_quantized_other_backends_stub + QuantizedCPU, QuantizedCUDA: empty_affine_quantized + autogen: _empty_affine_quantized.out + +# it's a factory function receiving a tensor argument, thus overriding explicitly +# other overrides are to provide a more helpful error message that dtype is required +- func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + category_override: factory + dispatch: + CPU: empty_per_channel_affine_quantized_other_backends_stub + QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized + autogen: _empty_per_channel_affine_quantized.out + +- func: resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + Meta: resize__symint + CPU: resize_ + CUDA: resize_cuda_ + MPS: resize_mps_ + QuantizedCPU: quantized_resize_cpu_ + SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ + autogen: resize, resize.out + +# This is a utility function to enable users to resize out tensor while registering kernels for out variants. +# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration +# to make it easy to register out variants for ops. +- func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function + dispatch: + Meta: _resize_output_ + autogen: _resize_output, _resize_output.out + +- func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + category_override: factory + variants: function + dispatch: + QuantizedCPU, QuantizedCUDA: empty_quantized + autogen: empty_quantized.out + +- func: empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + device_guard: False + +- func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: empty_like + QuantizedCPU, QuantizedCUDA: empty_like_quantized + SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo + SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr + NestedTensorCPU, NestedTensorCUDA: empty_like_nested + autogen: empty_like.out + +- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CPU: empty_strided_cpu + CUDA: empty_strided_cuda + MPS: empty_strided_mps + Meta: empty_strided_meta_symint + QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized + autogen: empty_strided.out + tags: core + +- func: erf(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erf.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: erf_sparse + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr + tags: [core, pointwise] + +- func: erf_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erf.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: erf_sparse_ + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_ + tags: pointwise + +- func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erf_out + MPS: erf_out_mps + SparseCPU, SparseCUDA: erf_sparse_out + SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out + tags: pointwise + +- func: erfc(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erfc.out + variants: function, method + tags: pointwise + +- func: erfc_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erfc.out + variants: function, method + tags: pointwise + +- func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erfc_out + tags: pointwise + +- func: exp(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: exp.out + variants: function, method + tags: [core, pointwise] + +- func: exp_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: exp.out + variants: function, method + tags: pointwise + +- func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: exp_out + MPS: exp_out_mps + tags: pointwise + +- func: exp2(Tensor self) -> Tensor + structured_delegate: exp2.out + variants: function, method + tags: pointwise + +- func: exp2_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: exp2.out + variants: function, method + tags: pointwise + +- func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: exp2_out + MPS: exp2_out_mps + tags: pointwise + +- func: expm1(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: expm1.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: expm1_sparse + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr + tags: pointwise + +- func: expm1_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: expm1.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: expm1_sparse_ + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_ + tags: pointwise + +- func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: expm1_out + MPS: expm1_out_mps + SparseCPU, SparseCUDA: expm1_sparse_out + SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out + tags: pointwise + +- func: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: expand + tags: core + +- func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + +# decomposes to eye.m +- func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: eye + +- func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: eye + +- func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: eye_out_cpu + CUDA: eye_out_cuda + MPS: eye_out_mps + +- func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: eye_out_cpu + CUDA: eye_out_cuda + MPS: eye_out_mps + +- func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) + variants: function, method + +- func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) + variants: function, method + +- func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_symint + +- func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a) + variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_dimname_symint + +- func: fill.Scalar(Tensor self, Scalar value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + tags: core + +- func: fill.Tensor(Tensor self, Tensor value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + +- func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: fill_ + MPS: fill_scalar_mps + QuantizedCPU, QuantizedCUDA: fill_quantized_ + Meta: fill_meta_ + SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ + autogen: fill.Scalar_out + +- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: fill_ + MPS: fill_tensor_mps_ + QuantizedCPU, QuantizedCUDA: fill_quantized_ + Meta: fill_meta_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ + autogen: fill.Tensor_out + +- func: floor(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: floor.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: floor_sparse + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr + tags: [core, pointwise] + +- func: floor_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: floor.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: floor_sparse_ + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_ + tags: pointwise + +- func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: floor_out + MPS: floor_out_mps + SparseCPU, SparseCUDA: floor_sparse_out + SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out + tags: pointwise + +- func: floor_divide(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: floor_divide + MPS: floor_divide_mps + SparseCPU, SparseCUDA: floor_divide_sparse + +- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: floor_divide_ + MPS: floor_divide_mps_ + SparseCPU, SparseCUDA: floor_divide_sparse_ + +- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: floor_divide_out + MPS: floor_divide_out_mps + SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim + +- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: frac(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: frac.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: frac_sparse + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr + tags: pointwise + +- func: frac_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: frac.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: frac_sparse_ + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_ + tags: pointwise + +- func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: frac_out + MPS: frac_out_mps + SparseCPU, SparseCUDA: frac_sparse_out + SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out + tags: pointwise + +- func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: full + autogen: full.names_out + +- func: full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: full + tags: core + +- func: full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: full_out + +- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: full_like + autogen: full_like.out + +- func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: from_file + autogen: from_file.out + +- func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: gcd_out + tags: pointwise + +- func: gcd(Tensor self, Tensor other) -> Tensor + structured_delegate: gcd.out + variants: function, method + tags: pointwise + +- func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: gcd.out + variants: function, method + +- func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lcm_out + tags: pointwise + +- func: lcm(Tensor self, Tensor other) -> Tensor + structured_delegate: lcm.out + variants: function, method + tags: pointwise + +- func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: lcm.out + variants: function, method + +# NOTE [ grid_sampler Native Functions ] +# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to +# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of +# which has the corresponding backward defined as native functions as well. +# However, we do shape checking everywhere for now since each of the mentioned +# functions can be called directly, which will lead to crashes otherwise. +# See https://github.com/pytorch/pytorch/issues/73187 for more information. +# +# There is also _grid_sampler_2d_backward_cpu_fallback which is an +# implementation detail of grid_sampler_2d and is only exposed here for testing +# purposes. +# +# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to +# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in +# `interpolation_mode` because it only supports Bilinear interpolation mode. +# Nor does it take in `align_corners` because it only supports the mode +# `align_corners = True`. +- func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + +- func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CPU, QuantizedCPU: grid_sampler_2d_cpu + CUDA: grid_sampler_2d_cuda + MPS: grid_sampler_2d_mps + autogen: grid_sampler_2d.out + tags: core + +# `grid_sampler_2d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). 
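+# A minimal usage sketch (assuming the torch.ops.aten binding, that bilinear
+# interpolation / zeros padding map to enum value 0 in native/GridSampler.h,
+# and placeholder tensor names): passing output_mask=[False, True] skips the
+# `grad_input` computation when only the gradient w.r.t. `grid` is needed,
+#   import torch
+#   _, grad_grid = torch.ops.aten.grid_sampler_2d_backward(
+#       grad_out, inp, grid, 0, 0, False, [False, True])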
+- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + CPU: grid_sampler_2d_backward_cpu + CUDA: grid_sampler_2d_backward_cuda + autogen: grid_sampler_2d_backward.out + +# See NOTE [ grid_sample CPU fallback ] +- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CompositeExplicitAutograd: _grid_sampler_2d_cpu_fallback + autogen: _grid_sampler_2d_cpu_fallback.out + +- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) + +- func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + dispatch: + CPU: grid_sampler_3d_cpu + CUDA: grid_sampler_3d_cuda + autogen: grid_sampler_3d.out + +# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + CPU: grid_sampler_3d_backward_cpu + CUDA: grid_sampler_3d_backward_cuda + autogen: grid_sampler_3d_backward.out + +- func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hann_window + autogen: hann_window.out + +- func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hann_window + autogen: hann_window.periodic_out + +- func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.out + +- func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_out + +- func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_alpha_out + +- func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: hamming_window + autogen: hamming_window.periodic_alpha_beta_out + +- func: kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.out + +- func: kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.periodic_out + +- func: kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: kaiser_window + autogen: kaiser_window.beta_out + +- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor + +- func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor + +- func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, CUDA: native_group_norm + CompositeExplicitAutograd: math_group_norm + autogen: native_group_norm.out + tags: core + +- func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, CUDA: native_group_norm_backward + autogen: native_group_norm_backward.out + tags: core + +# Real to complex forward FFT +- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor + variants: function + dispatch: + CPU: _fft_r2c_mkl + CUDA: _fft_r2c_cufft + +- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: _fft_r2c_mkl_out + CUDA: _fft_r2c_cufft_out + +# Complex to real inverse FFT +- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor + variants: function + dispatch: + CPU: _fft_c2r_mkl + CUDA: _fft_c2r_cufft + +- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: _fft_c2r_mkl_out + CUDA: _fft_c2r_cufft_out + +# Standard complex to complex FFT (forward or backward) +- func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor + variants: function + dispatch: + CPU: _fft_c2c_mkl + CUDA: _fft_c2c_cufft + +- func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + dispatch: + CPU: _fft_c2c_mkl_out + CUDA: _fft_c2c_cufft_out + +- func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: _validate_compressed_sparse_indices_cpu + CUDA: _validate_compressed_sparse_indices_cuda + +- func: _cufft_get_plan_cache_size(int device_index) -> int + +- func: _cufft_get_plan_cache_max_size(int device_index) -> int + +- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> () + +- func: _cufft_clear_plan_cache(int device_index) -> () + +- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: index.Tensor_out + variants: function, method + dispatch: + QuantizedCPU: quantized_index + tags: dynamic_output_shape + # NB: This function is special-cased in tools/autograd/gen_variable_type.py + # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: + # - Tensor Tensor::index(ArrayRef indices) + # - Tensor Tensor::index(std::initializer_list indices) + +- func: index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + structured_inherits: TensorIteratorBase + precomputed: + - indices -> DimVector sizes, DimVector strides + dispatch: + CPU, CUDA, MPS: index_out + +# Used by inductor to signal indexing without bounds checks +# Note that we don't support boolean indexing, to avoid dynamic output shapes +- func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + variants: function + dispatch: + CPU, CUDA: _unsafe_index + +- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: index_copy_out + +- func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + variants: method + structured_delegate: index_copy.out + +- func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor + variants: function, method + structured_delegate: index_copy.out + +- func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!) + variants: method + +- func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor + variants: function, method + +- func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) 
+ device_check: NoCheck # delegate to _index_put_impl_, which leverages TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_put_ + autogen: index_put.out + # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: + # - Tensor & Tensor::index_put_(ArrayRef indices, Tensor const & rhs) + # - Tensor & Tensor::index_put_(ArrayRef indices, Scalar v) + # - Tensor & Tensor::index_put_(std::initializer_list indices, Tensor const & rhs) + # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) + +- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_put + +- func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_index_put + +- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA, MPS: _index_put_impl_ + QuantizedCPU: _index_put_impl_quantized_cpu_ + QuantizedCUDA: _index_put_impl_quantized_cuda_ + autogen: _index_put_impl, _index_put_impl.out + +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor + variants: function + +- func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor + variants: function, method + +- func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Tensor_out + +- func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Tensor_out + +- func: isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Scalar_out + +- func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Scalar_out + +- func: isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + structured: True + dispatch: + CPU, CUDA: isin_Scalar_Tensor_out + +- func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Scalar_Tensor_out + +- func: isnan(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, MPS: isnan + SparseCPU, SparseCUDA: isnan_sparse + SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr + autogen: isnan.out + tags: [core, pointwise] + +- func: is_distributed(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + +- func: is_floating_point(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_complex(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_conj(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: _is_zerotensor(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: is_neg(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: isreal(Tensor self) -> Tensor + variants: function, method + +- func: is_nonzero(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + +- func: is_same_size(Tensor self, Tensor other) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_is_same_size + CompositeExplicitAutograd: is_same_size + +- func: is_signed(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: is_inference(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor + +- func: kron(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: kthvalue + +- func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: kthvalue_out_cpu + CUDA: kthvalue_out_cuda + +- func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor + dispatch: + CompositeImplicitAutograd: layer_norm_symint + +- func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: layer_norm_cpu + CUDA: layer_norm_cuda + MPS: layer_norm_mps + CompositeExplicitAutograd: math_native_layer_norm + NestedTensorCPU, NestedTensorCUDA: nested_layer_norm + autogen: native_layer_norm.out + tags: core + +- func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: layer_norm_backward_cpu + CUDA: layer_norm_backward_cuda + MPS: layer_norm_backward_mps + NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested + autogen: native_layer_norm_backward.out + tags: core + +- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: nan_to_num + SparseCPU, SparseCUDA: nan_to_num_sparse + tags: pointwise + +- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: nan_to_num_ + SparseCPU, SparseCUDA: nan_to_num_sparse_ + tags: pointwise + +- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nan_to_num_out + MPS: nan_to_num_out_mps + SparseCPU, SparseCUDA: nan_to_num_sparse_out + tags: pointwise + +- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: linear + NestedTensorCPU, NestedTensorCUDA: nested_linear + MPS: _mps_linear + +- func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_linear_backward + MPS: mps_linear_backward + autogen: linear_backward.out + +- func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CompositeExplicitAutograd: linear_out + +- func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + MkldnnCPU: mkldnn_linear + autogen: mkldnn_linear.out + +- func: mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor + dispatch: + MkldnnCPU: mkldnn_linear_backward_input + autogen: mkldnn_linear_backward_input.out + +- func: mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) + dispatch: + MkldnnCPU: mkldnn_linear_backward_weights + autogen: mkldnn_linear_backward_weights.out + +- func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MkldnnCPU: mkldnn_linear_backward + autogen: mkldnn_linear_backward.out + +- func: _structured_sparse_linear(Tensor input, Tensor weight, Tensor mask_or_meta, *, Tensor? 
bias=None) -> (Tensor, Tensor) + dispatch: + CUDA: _structured_sparse_linear + +- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor + +- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor + +- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) + +- func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor + +- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor + +- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor + +- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor + +- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor + +- func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: function, method + tags: pointwise + +- func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: linspace_out + CUDA: linspace_cuda_out + MPS: linspace_out_mps + +- func: log(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log.out + variants: function, method + tags: [core, pointwise] + +- func: log_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log.out + variants: function, method + tags: pointwise + +- func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log_out + MPS: log_out_mps + tags: pointwise + +- func: log10(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log10.out + variants: function, method + tags: pointwise + +- func: log10_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log10.out + variants: function, method + tags: pointwise + +- func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log10_out + MPS: log10_out_mps + tags: pointwise + +- func: log1p(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log1p.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: log1p_sparse + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr + tags: pointwise + +- func: log1p_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log1p.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: log1p_sparse_ + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_ + tags: pointwise + +- func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log1p_out + MPS: log1p_out_mps + SparseCPU, SparseCUDA: log1p_sparse_out + SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out + tags: pointwise + +- func: log2(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: log2.out + variants: function, method + tags: pointwise + +- func: log2_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: log2.out + variants: function, method + tags: pointwise + +- func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: log2_out + MPS: log2_out_mps + tags: pointwise + +- func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logaddexp_out + MPS: logaddexp_out_mps + tags: pointwise + +- func: logaddexp(Tensor self, Tensor other) -> Tensor + variants: method, function + structured_delegate: logaddexp.out + tags: pointwise + +- func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logaddexp2_out + MPS: logaddexp2_out_mps + tags: pointwise + +- func: logaddexp2(Tensor self, Tensor other) -> Tensor + variants: method, function + structured_delegate: logaddexp2.out + tags: pointwise + +- func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: xlogy.OutTensor + variants: function, method + tags: pointwise + +- func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy + tags: pointwise + +- func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: xlogy + tags: pointwise + +# xlogy: inplace variant +- func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: xlogy.OutTensor + tags: pointwise + +- func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: xlogy_ + +# xlogy: out variant +- func: xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: xlogy_out + MPS: xlogy_out_mps + tags: pointwise + +- func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy_out + tags: pointwise + +- func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: xlogy_out + tags: pointwise + +- func: logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: logspace_out + CUDA: logspace_cuda_out + +# log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. +- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: log_softmax_out + +- func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _log_softmax.out + tags: core + +- func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: log_softmax_cpu_out + CUDA: log_softmax_cuda_out + MPS: log_softmax_mps_out + +- func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor + structured_delegate: _log_softmax_backward_data.out + +- func: _log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: log_softmax_backward_cpu_out + CUDA: log_softmax_backward_cuda_out + MPS: log_softmax_backward_mps_out + +- func: _logcumsumexp(Tensor self, int dim) -> Tensor + dispatch: + CPU: _logcumsumexp_cpu + CUDA: _logcumsumexp_cuda + +- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _logcumsumexp_out_cpu + CUDA: _logcumsumexp_out_cuda + +- func: logcumsumexp(Tensor self, int dim) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: logcumsumexp + +- func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: logcumsumexp_out + +- func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor + variants: function, method + +- func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + +- func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: logsumexp + +- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + # calls squeeze + CompositeExplicitAutogradNonFunctional: logsumexp_out + +- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + +- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor + +- func: matmul(Tensor self, Tensor other) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: matmul + NestedTensorCPU, NestedTensorCUDA: matmul_nested + +- func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor) + dispatch: + NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested + autogen: matmul_backward.out + +- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeImplicitAutograd: matmul_out + NestedTensorCPU, NestedTensorCUDA: matmul_out_nested + +# Alias to linalg.matrix_power +- func: matrix_power(Tensor self, int n) -> Tensor + variants: function, method + +# Alias to linalg.matrix_power +- func: matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) + +# Alias to linalg.matrix_exp +- func: matrix_exp(Tensor self) -> Tensor + variants: function, method + +# This function should be deprecated in favor of differential_analytic_matrix_function in FunctionsManual.cpp +- func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor + +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax(Tensor self) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax_all + autogen: _aminmax.out + +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax + autogen: _aminmax.dim_out + +- func: aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max) + device_check: NoCheck # TensorIterator + structured_delegate: aminmax.out + variants: function, method + +- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: aminmax_out + MPS: aminmax_out_mps + +- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor + dispatch: + CPU, CUDA: _compute_linear_combination + +- func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: _compute_linear_combination_out + +- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + structured_delegate: max.dim_max + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: qmax + tags: core + +- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: max_out + MPS: max_out_mps + +- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) 
indices) + device_check: NoCheck # TensorIterator + +- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: value_selecting_reduction_backward_symint + +- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + variants: function, method + structured_delegate: amax.out + tags: core + +- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: amax_out + MPS: amax_out_mps + +# Return: (Tensor output, Tensor indices) +- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + +- func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + +- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + CompositeImplicitAutograd: max_pool2d + MPS: mps_max_pool2d + +- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: mps_max_pool2d_backward + autogen: max_pool2d_backward.out + +- func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool2d + autogen: mkldnn_max_pool2d.out + +- func: mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool2d_backward + autogen: mkldnn_max_pool2d_backward.out + +- func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool3d + autogen: mkldnn_max_pool3d.out + +- func: mkldnn_max_pool3d_backward(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MkldnnCPU: mkldnn_max_pool3d_backward + autogen: mkldnn_max_pool3d_backward.out + +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool1d + autogen: quantized_max_pool1d.out + +- func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool2d + QuantizedCUDA: quantized_max_pool2d_cudnn + autogen: quantized_max_pool2d.out + +- func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool3d + autogen: quantized_max_pool3d.out + +- func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + +# The CPU and GPU dispatch variants are named weirdly here because otherwise there +# are namespacing issues in C++ +- func: mean(Tensor self, *, ScalarType? 
dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: mean + +# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this. +# FIXME: fix CI jobs and re-enable this +#- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +# device_check: NoCheck # TensorIterator +# dispatch: +# CompositeExplicitAutograd: mean_dtype_out + +- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: mean.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + QuantizedCPU: mean_quantized_cpu + tags: core + +- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: mean_out + MPS: mean_out_mps + QuantizedCPU: mean_out_quantized_cpu + +- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # Composite + variants: function, method + +- func: nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # Composite + +- func: median(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU: median_cpu + CUDA: median_cuda + MPS: median_mps + autogen: median.out + +- func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: median + +- func: median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: median_out_cpu + CUDA: median_out_cuda + MPS: median_out_mps + +- func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: nanmedian(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU: nanmedian_cpu + CUDA: nanmedian_cuda + autogen: nanmedian.out + +- func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CompositeExplicitAutograd: nanmedian + +- func: nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU: nanmedian_out_cpu + CUDA: nanmedian_out_cuda + +- func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + +- func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + structured_delegate: min.dim_min + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: qmin + tags: core + +- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + structured: True + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: min_out + MPS: min_out_mps + +- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: function, method + +- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + +- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + variants: function, method + structured_delegate: amin.out + tags: core + +- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: amin_out + MPS: amin_out_mps + +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution + autogen: _mps_convolution.out + +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_convolution_backward + autogen: mps_convolution_backward.out + +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + CompositeExplicitAutograd: mkldnn_convolution + autogen: mkldnn_convolution.out + +- func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: mkldnn_rnn_layer + autogen: mkldnn_rnn_layer.out + +- func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: mkldnn_rnn_layer_backward + autogen: mkldnn_rnn_layer_backward.out + +- func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_batch_norm + autogen: miopen_batch_norm.out + +- func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_batch_norm_backward + autogen: miopen_batch_norm_backward.out + +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_convolution + autogen: miopen_convolution.out + +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_convolution_transpose + autogen: miopen_convolution_transpose.out + +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + dispatch: + CUDA: miopen_depthwise_convolution + autogen: miopen_depthwise_convolution.out + +- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor + dispatch: + CUDA: miopen_convolution_relu + +- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor + dispatch: + CUDA: miopen_convolution_add_relu + +- func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: miopen_rnn + autogen: miopen_rnn.out + tags: nondeterministic_seeded + + +- func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + dispatch: + CUDA: miopen_rnn_backward + autogen: miopen_rnn_backward.out + +- func: mm(Tensor self, Tensor mat2) -> Tensor + structured_delegate: mm.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: _sparse_mm + SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm + tags: core + +- func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: mm_out_cpu + CUDA: mm_out_cuda + MPS: mm_out_mps + SparseCPU, SparseCUDA: _sparse_mm_out + SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out + +- func: _int_mm(Tensor self, Tensor mat2) -> Tensor + dispatch: + CUDA: _int_mm_cuda + +- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CUDA: _int_mm_out_cuda + +- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor + python_module: sparse + +- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor + python_module: sparse + +- func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor + dispatch: + SparseCPU: sparse_sparse_matmul_cpu + SparseCUDA: sparse_sparse_matmul_cuda + autogen: _sparse_sparse_matmul.out + +- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + dispatch: + CPU, CUDA: mode + +- func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CompositeExplicitAutograd: mode_out + +- func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) + variants: function, method + +- func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: mul.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: mul.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: mul_sparse + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr + MkldnnCPU: mkldnn_mul + ZeroTensor: mul_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor + tags: [core, pointwise] + +- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: mul.out + variants: method + dispatch: + SparseCPU, SparseCUDA: mul_sparse_ + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ + MkldnnCPU: mkldnn_mul_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor + tags: pointwise + +- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: mul_out + MPS: mul_out_mps + SparseCPU: mul_out_sparse_cpu + SparseCUDA: mul_out_sparse_cuda + SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr + MkldnnCPU: mkldnn_mul_out + tags: pointwise + # For C++ only, until we have conversion from C++ numbers to Tensor + +- func: mul.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: mul + SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar + tags: [core, pointwise] + +- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: mul_ + SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar + autogen: mul.Scalar_out + tags: pointwise +# multiply, alias for mul + +- func: multiply.Tensor(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: multiply.Scalar(Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ variants: method
+
+- func: mv(Tensor self, Tensor vec) -> Tensor
+ variants: function, method
+ dispatch:
+ CompositeExplicitAutograd: mv
+ SparseCPU, SparseCUDA: mv_sparse
+
+- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CompositeExplicitAutograd: mv_out
+
+- func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CPU, CUDA: mvlgamma_out
+ tags: pointwise
+
+- func: mvlgamma(Tensor self, int p) -> Tensor
+ device_check: NoCheck # TensorIterator
+ variants: function, method
+ dispatch:
+ CompositeExplicitAutograd: mvlgamma
+ tags: pointwise
+
+- func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
+ device_check: NoCheck # TensorIterator
+ variants: method
+ dispatch:
+ CompositeExplicitAutograd: mvlgamma_
+ tags: pointwise
+
+- func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+ variants: function, method
+ dispatch:
+ CPU: narrow_copy_dense_cpu
+ SparseCPU, SparseCUDA: narrow_copy_sparse
+ CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
+ tags: view_copy
+
+- func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CPU: narrow_copy_dense_cpu_out
+
+- func: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+ variants: function, method
+ device_check: NoCheck
+ device_guard: False
+ dispatch:
+ CompositeImplicitAutograd: narrow_symint
+
+- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+ variants: function, method
+ device_check: NoCheck
+ device_guard: False
+ dispatch:
+ CompositeImplicitAutograd: narrow_tensor_symint
+
+- func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: batch_norm_cpu
+ CUDA: batch_norm_cuda
+ MPS: batch_norm_mps
+ MkldnnCPU: mkldnn_batch_norm
+ tags: core
+
+- func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+ dispatch:
+ CUDA: batch_norm_cuda_out
+ MPS: batch_norm_mps_out
+ CPU: batch_norm_cpu_out
+
+# TODO: In 2 weeks, we should make native_batch_norm composite implicit so that this correct schema percolates correctly through our dispatching
+- func: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _batch_norm_legit_cpu
+ CUDA: _batch_norm_legit_cuda
+ MPS: _batch_norm_legit_mps
+ MkldnnCPU: _mkldnn_batch_norm_legit
+ autogen: _native_batch_norm_legit_functional
+
+# HACK: identical to _native_batch_norm_legit, but training is known to be False,
+# so we know that running stats will not be mutated.
+# The real fix here is batch norm consolidation.
+- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_legit_no_training
+ autogen: _native_batch_norm_legit_no_training.out
+
+- func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) 
running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!)) + dispatch: + CPU: _batch_norm_legit_cpu_out + CUDA: _batch_norm_legit_cuda_out + MPS: _batch_norm_legit_mps_out + +- func: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_legit_no_stats_cpu + CUDA: _batch_norm_legit_no_stats_cuda + MPS: _batch_norm_legit_no_stats_mps + MkldnnCPU: _mkldnn_batch_norm_legit_no_stats + tags: core + +- func: _native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + dispatch: + CPU: _batch_norm_legit_no_stats_cpu_out + CUDA: _batch_norm_legit_no_stats_cuda_out + MPS: _batch_norm_legit_no_stats_mps_out + +- func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_stats_cuda + autogen: batch_norm_stats.out + +- func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor + dispatch: + CUDA: batch_norm_elemt_cuda + +- func: batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CUDA: batch_norm_elemt_cuda_out + +# for backward compatibility +- func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_gather_stats_cuda + autogen: batch_norm_gather_stats.out + +- func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor) + dispatch: + CUDA: batch_norm_gather_stats_with_counts_cuda + autogen: batch_norm_gather_stats_with_counts.out + +- func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: batch_norm_backward_cpu + CUDA: batch_norm_backward_cuda + MPS: batch_norm_backward_mps + MkldnnCPU: mkldnn_batch_norm_backward + autogen: native_batch_norm_backward.out + +- func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: batch_norm_backward_reduce_cuda + autogen: batch_norm_backward_reduce.out + +- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu, Tensor count) -> Tensor + dispatch: + CUDA: batch_norm_backward_elemt_cuda + autogen: batch_norm_backward_elemt.out + +- func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor) + dispatch: + CPU: batch_norm_update_stats_cpu + CUDA: batch_norm_update_stats_cuda + autogen: batch_norm_update_stats.out + +- func: is_vulkan_available() -> bool + +- func: _nnpack_available() -> bool + +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? 
bias, SymInt[2] padding, int[2] stride=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _nnpack_spatial_convolution + autogen: _nnpack_spatial_convolution.out + +- func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: ones + autogen: ones.names_out + +- func: ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: ones + +- func: ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: ones_out + +- func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: ones_like + NestedTensorCPU, NestedTensorCUDA: ones_like + autogen: ones_like.out + +- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor + +- func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor + +- func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor + dispatch: + CompositeExplicitAutograd: _euclidean_dist + autogen: _euclidean_dist.out + +- func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor + dispatch: + CPU, CUDA: _cdist_forward + MPS: _cdist_forward_mps + autogen: _cdist_forward.out + +- func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor + dispatch: + CPU, CUDA: _cdist_backward + autogen: _cdist_backward.out + +- func: pdist(Tensor self, float p=2) -> Tensor + +- func: _pdist_forward(Tensor self, float p=2) -> Tensor + dispatch: + CPU, CUDA: _pdist_forward + autogen: _pdist_forward.out + +- func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor + dispatch: + CPU, CUDA: _pdist_backward + autogen: _pdist_backward.out + +- func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor + variants: function + +- func: permute(Tensor(a) self, int[] dims) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: permute + MPS: permute_mps + SparseCPU, SparseCUDA: permute_sparse_coo + tags: core + +- func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + variants: function, method + +- func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) + variants: function, method + +# moveaxis, alias for movedim +- func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + variants: function, method + +- func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) + variants: function, method + +# Only exposed from C++ -- in Python, +# we expose it as an attribute `T`, not a function. +# +# I'd like to name this "T" in C++ too, but +# calling a native function "T" causes undefined +# behavior on Windows, for reasons I don't understand +# (maybe related to capital letter collation somehow...) 
+- func: numpy_T(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'H' +- func: matrix_H(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'mT' +- func: mT(Tensor(a) self) -> Tensor(a) + variants: method + +# Exposed on Python as an attribute 'mH' +- func: mH(Tensor(a) self) -> Tensor(a) + variants: method + +- func: adjoint(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + dispatch: + CPU: pixel_shuffle_cpu + CompositeExplicitAutogradNonFunctional: math_pixel_shuffle + autogen: pixel_shuffle.out + +- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + dispatch: + CPU: pixel_unshuffle_cpu + CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle + autogen: pixel_unshuffle.out + +- func: channel_shuffle(Tensor self, int groups) -> Tensor + dispatch: + CPU: channel_shuffle + QuantizedCPU: channel_shuffle_quantized_cpu + autogen: channel_shuffle.out + +- func: native_channel_shuffle(Tensor self, int groups) -> Tensor + dispatch: + CPU: channel_shuffle_cpu + CompositeImplicitAutograd: math_channel_shuffle + +- func: is_pinned(Tensor self, Device? device=None) -> bool + variants: method + dispatch: + CUDA: is_pinned_cuda + MPS: is_pinned_mps + CompositeExplicitAutograd: is_pinned_default + +# TODO: add a copy kwarg that guarantees that the tensor is put into fresh +# pinned memory +- func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) + variants: method + +# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor +- func: _pin_memory(Tensor self, Device? device=None) -> Tensor + dispatch: + CUDA: _pin_memory_cuda + MPS: _pin_memory_mps + autogen: _pin_memory.out + +- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor + variants: function, method + +- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor + variants: function + +- func: rad2deg(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: rad2deg + SparseCPU, SparseCUDA: rad2deg_sparse + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr + +- func: rad2deg_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: rad2deg_ + SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_ + +- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: rad2deg_out + SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_out + +- func: deg2rad(Tensor self) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: deg2rad + SparseCPU, SparseCUDA: deg2rad_sparse + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr + tags: pointwise + +- func: deg2rad_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + dispatch: + CompositeExplicitAutograd: deg2rad_ + SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_ + tags: pointwise + +- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: deg2rad_out + SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out + tags: pointwise + +- func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: scalar_tensor + autogen: scalar_tensor.out + tags: core + +- func: rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: rand + autogen: rand.names_out + tags: nondeterministic_seeded + +- func: rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand + autogen: rand.generator_with_names_out + +- func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] + dispatch: + CompositeExplicitAutograd: rand + +- func: rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand + +- func: rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: rand_out + +- func: rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: rand_like + autogen: rand_like.out + +- func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint + +- func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) 
+ tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randint_out + +- func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: randint_like + autogen: randint_like.out + +- func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd: randint_like + autogen: randint_like.low_dtype_out + +- func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] + dispatch: + CompositeExplicitAutograd: randn + +- func: randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randn + +- func: randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: randn + autogen: randn.names_out + +- func: randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: randn + autogen: randn.generator_with_names_out + +- func: randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + +- func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like + autogen: randn_like.out + +- func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randperm + +- func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randperm + +- func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) 
+ tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: randperm_out + +- func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU: randperm_out_cpu + CUDA: randperm_out_cuda + MPS: randperm_out_mps + +- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: range + +- func: range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: range_out_no_step + +- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, Meta: range_out + CUDA: range_cuda_out + MPS: range_mps_out + cpp_no_default_args: ['step'] + +- func: ravel(Tensor(a) self) -> Tensor(a) + variants: function, method + +- func: reciprocal(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: reciprocal.out + variants: function, method + tags: [core, pointwise] + +- func: reciprocal_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: reciprocal.out + variants: function, method + tags: pointwise + +- func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: reciprocal_out + MPS: reciprocal_out_mps + tags: pointwise + +- func: neg(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: neg.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: neg_sparse + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg + tags: [core, pointwise] + +- func: neg_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: neg.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: neg_sparse_ + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_ + tags: pointwise + +- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: neg_out + MPS: neg_out_mps + SparseCPU, SparseCUDA: neg_out_sparse + SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out + tags: pointwise +# Alias for neg + +- func: negative(Tensor self) -> Tensor + variants: function, method + +- func: negative_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: repeat(Tensor self, SymInt[] repeats) -> Tensor + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + dispatch: + CompositeExplicitAutograd: repeat + MPS: repeat_mps + autogen: repeat.out + tags: core + +- func: repeat_interleave.Tensor(Tensor repeats, *, int? 
output_size=None) -> Tensor + variants: function + dispatch: + CPU: repeat_interleave_cpu + CUDA: repeat_interleave_cuda + MPS: repeat_interleave_mps + tags: dynamic_output_shape + autogen: repeat_interleave.Tensor_out + +- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, int? output_size=None) -> Tensor + variants: function, method + +- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, int? output_size=None) -> Tensor + variants: function, method + dispatch: + CompositeImplicitAutograd: repeat_interleave_symint + +- func: reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: reshape_symint + CompositeImplicitAutogradNestedTensor: reshape_nested + +- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_copy_symint + +# NOTE [ _reshape_alias ] is meant to be used in the implementation of reshape. +# They are not user-facing, hence the leading underscore. Please don't use it +# anywhere else. +- func: _reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias + # We don't need to support mkldnn since this is handled explicitly by the reshape operator. + +- func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_reshape + autogen: _mkldnn_reshape.out + +- func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: reshape_as + CompositeImplicitAutogradNestedTensor: reshape_as_nested + +- func: round(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: round.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: round_sparse + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr + tags: pointwise + +- func: round_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: round.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: round_sparse_ + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_ + tags: pointwise + +- func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: round_out + CUDA: round_out + MPS: round_out_mps + SparseCPU, SparseCUDA: round_sparse_out + SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out + tags: pointwise + +- func: round.decimals(Tensor self, *, int decimals) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: round.decimals_out + variants: function, method + tags: pointwise + +- func: round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: round.decimals_out + variants: function, method + tags: pointwise + +- func: round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: round_decimals_out + CUDA: round_decimals_out + tags: pointwise + +- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + +- func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) + tags: nondeterministic_seeded + device_check: NoCheck # TensorIterator + +- func: relu(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: relu + MPS: relu_mps + MkldnnCPU: mkldnn_relu + QuantizedCPU: relu_quantized_cpu + QuantizedCUDA: relu_quantized_cuda + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu + SparseCPU, SparseCUDA: relu_sparse + SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr + tags: [core, pointwise] + +- func: relu_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: relu_ + MPS: relu_mps_ + MkldnnCPU: mkldnn_relu_ + QuantizedCPU: relu_quantized_cpu_ + QuantizedCUDA: relu_quantized_cuda_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ + SparseCPU, SparseCUDA: relu_sparse_ + SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr_ + autogen: relu.out + tags: pointwise + +- func: relu6(Tensor self) -> Tensor + python_module: nn + +- func: relu6_(Tensor(a!) self) -> Tensor(a!) + python_module: nn + +- func: prelu(Tensor self, Tensor weight) -> Tensor + variants: function, method + autogen: prelu.out + +- func: _prelu_kernel(Tensor self, Tensor weight) -> Tensor + dispatch: + CPU, CUDA: _prelu_kernel + QuantizedCPU: _prelu_kernel_quantized_cpu + MkldnnCPU: mkldnn_prelu + MPS: prelu_mps + +- func: _prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _prelu_kernel_backward + MkldnnCPU: mkldnn_prelu_backward + MPS: prelu_backward_mps + +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: gelu_out_cpu + CUDA: gelu_out_cuda + MPS: gelu_out_mps + +- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ + +- func: gelu(Tensor self, *, str approximate='none') -> Tensor + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + MkldnnCPU: mkldnn_gelu + QuantizedCPU: gelu_quantized_cpu + QuantizedCUDA: gelu_quantized_cuda + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu + tags: [core, pointwise] + +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU: gelu_backward_out_cpu + CUDA: gelu_backward_out_cuda + MPS: gelu_backward_out_mps + +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor + structured_delegate: gelu_backward.grad_input + python_module: nn + dispatch: + MkldnnCPU: mkldnn_gelu_backward + NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested + tags: pointwise + +- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor + variants: function + python_module: nn + device_check: NoCheck + device_guard: False + +- func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: hardshrink_out + +- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + structured_delegate: hardshrink.out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: hardshrink_backward_out + +- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + structured_delegate: hardshrink_backward.grad_input + variants: function, method + +- func: rsqrt(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: rsqrt.out + variants: function, method + tags: [core, pointwise] + +- func: rsqrt_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: rsqrt.out + variants: function, method + tags: pointwise + +- func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: rsqrt_out + MPS: rsqrt_out_mps + tags: pointwise + +- func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: select_symint + SparseCsrCPU, SparseCsrCUDA: select_sparse_csr + NestedTensorCPU, NestedTensorCUDA: select_nested + tags: core + +- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: select_backward_symint + autogen: select_backward.out + +- func: _nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint + +- func: selu(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + +- func: selu_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: celu + +- func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: celu_ + autogen: celu.out + +- func: silu(Tensor self) -> Tensor + structured_delegate: silu.out + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu + +- func: silu_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: silu.out + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_ + +- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: silu_out + MPS: silu_out_mps + +- func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: silu_backward_out + MPS: silu_backward_out_mps + +- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: silu_backward.grad_input + python_module: nn + dispatch: + CompositeImplicitAutograd: math_silu_backward + NestedTensorCPU, NestedTensorCUDA: silu_backward_nested + +- func: mish(Tensor self) -> Tensor + structured_delegate: mish.out + python_module: nn + +- func: mish_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: mish.out + python_module: nn + +- func: mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: mish_out + +- func: mish_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: mish_backward + CompositeImplicitAutograd: math_mish_backward + +- func: sigmoid(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sigmoid.out + variants: function, method + dispatch: + QuantizedCPU: sigmoid_quantized_cpu + MkldnnCPU: mkldnn_sigmoid + tags: [core, pointwise] + +- func: sigmoid_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sigmoid.out + variants: function, method + dispatch: + MkldnnCPU: mkldnn_sigmoid_ + tags: pointwise + +- func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sigmoid_out + MPS: sigmoid_out_mps + tags: pointwise + +- func: logit(Tensor self, float? eps=None) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: logit + MPS: logit_mps + tags: pointwise + +- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) + variants: function, method + dispatch: + CPU, CUDA: logit_ + tags: pointwise + +- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out + MPS: logit_out_mps + tags: pointwise + +- func: sin(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sin.out + variants: function, method + dispatch: + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr + SparseCPU, SparseCUDA: sin_sparse + tags: [core, pointwise] + +- func: sin_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sin.out + variants: function, method + dispatch: + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_ + SparseCPU, SparseCUDA: sin_sparse_ + tags: pointwise + +- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sin_out + MPS: sin_out_mps + SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out + SparseCPU, SparseCUDA: sin_sparse_out + tags: pointwise + +- func: sinc(Tensor self) -> Tensor + structured_delegate: sinc.out + variants: function, method + tags: pointwise + +- func: sinc_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: sinc.out + variants: function, method + tags: pointwise + +- func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sinc_out + tags: pointwise + +- func: sinh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sinh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sinh_sparse + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr + tags: [core, pointwise] + +- func: sinh_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sinh.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sinh_sparse_ + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_ + tags: pointwise + +- func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sinh_out + MPS: sinh_out_mps + SparseCPU, SparseCUDA: sinh_sparse_out + SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out + +# Returns a copy of this `Variable` that is detached from its autograd graph. +# This method is OK to call if the `Variable` is a view. +# +# NOTE: Previously, if we change the tensor metadata (e.g. sizes / strides / +# storage / storage_offset) of a tensor created from `detach()`, those metadata +# in the original tensor will also be updated. However, the new behavior is that +# those metadata changes to the detached tensor will not update the original tensor +# anymore, and in the `detach()` function we need to set `allow_tensor_metadata_change_` +# to false to make such changes explicitly illegal, in order to prevent users from +# changing metadata of the detached tensor and expecting the original tensor to also +# be updated. + tags: pointwise +- func: detach(Tensor(a) self) -> Tensor(a) + variants: function, method + dispatch: + CompositeExplicitAutograd: detach + NestedTensorCPU, NestedTensorCUDA: detach + +# Like `detach()`, but modifies this `Variable` in-place. This method may +# only be called on non-view `Variable`s. You can use `is_view()` to check +# this. If this `Variable` is a view, throws an `std::runtime_error()`. +- func: detach_(Tensor(a!) self) -> Tensor(a!) 
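+# Illustrative sketch (not part of the upstream schema) of the detach()/detach_()
+# semantics described in the comments above, assuming a standard torch build:
+#
+#   import torch
+#   x = torch.randn(3, requires_grad=True)
+#   y = x.detach()     # new tensor sharing storage with x, no autograd history
+#   y.requires_grad    # False; in-place data edits to y remain visible through x
+#   x.detach_()        # detaches x itself in place; raises if x is a view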
+ variants: function, method + tags: inplace_view + dispatch: + CompositeExplicitAutograd: detach_ + +- func: size.int(Tensor self, int dim) -> int + variants: function + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: size.Dimname(Tensor self, Dimname dim) -> int + variants: function, method + device_check: NoCheck + device_guard: False + +- func: sym_size.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_numel(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_storage_offset(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice + tags: core + +# NOTE: The implementation of split_with_sizes bypasses the dispatcher to call this; undo +# that if adding specific implementations here! + +- func: slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice_backward + autogen: slice_backward.out + +- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: slice_scatter + autogen: slice_scatter.out + tags: core + +- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: select_scatter_symint + autogen: select_scatter.out + +- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: diagonal_scatter + autogen: diagonal_scatter.out + +- func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint + autogen: as_strided_scatter.out + +- func: smm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + +# softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. +- func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + variants: function, method + +- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: softmax_out + +- func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor + variants: function, method + +- func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _softmax.out + dispatch: + MkldnnCPU: mkldnn_softmax + NestedTensorCPU, NestedTensorCUDA: softmax_nested + tags: core + +- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: softmax_cpu_out + CUDA: softmax_cuda_out + MPS: softmax_mps_out + +- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor + structured_delegate: _softmax_backward_data.out + dispatch: + NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward + +- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + dispatch: + CPU: softmax_backward_cpu_out + CUDA: softmax_backward_cuda_out + MPS: softmax_backward_mps_out + +- func: unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsafe_split + autogen: unsafe_split.Tensor_out + +- func: split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: split + +- func: split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_guard: False + dispatch: + CompositeImplicitAutograd: split_symint + +- func: unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsafe_split_with_sizes + autogen: unsafe_split_with_sizes.out + +- func: split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[] + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: split_with_sizes + NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested + +- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] + variants: function, method + +- func: dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] + variants: function, method + +- func: squeeze(Tensor(a) self) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_nested + +- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested + tags: core + +- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + + +- func: 
squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: squeeze + QuantizedCPU, QuantizedCUDA: squeeze_quantized + NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested + tags: core + +- func: squeeze_(Tensor(a!) self) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: squeeze_ + +- func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + +- func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _sspaddmm_out_only_sparse + CUDA: _sspaddmm_out_only_sparse_cuda + SparseCPU: _sspaddmm_out_cpu + SparseCUDA: _sspaddmm_out_cuda + +- func: stack(Tensor[] tensors, int dim=0) -> Tensor + dispatch: + CompositeExplicitAutograd: stack + +- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: stack_out + +- func: _stack(Tensor[] tensors, int dim=0) -> Tensor + dispatch: # match the backends supported by _cat + CPU: _stack_cpu + CompositeExplicitAutograd: _stack + +- func: _stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: # match the backends supported by _cat_out + CPU: _stack_out_cpu + CompositeExplicitAutograd: _stack_out + +- func: hstack(Tensor[] tensors) -> Tensor + +- func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: vstack(Tensor[] tensors) -> Tensor + +- func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: dstack(Tensor[] tensors) -> Tensor + +- func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +# Overload without center & pad mode, needed for forward-compatibility +- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method + cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized'] + +- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method + +- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? 
length=None, bool return_complex=False) -> Tensor + variants: function, method + +- func: stride.int(Tensor self, int dim) -> int + variants: function + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + +- func: stride.Dimname(Tensor self, Dimname dim) -> int + variants: function, method + device_check: NoCheck + device_guard: False + +- func: sym_stride.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: sum + SparseCPU, SparseCUDA: sum_coo + SparseCsrCPU, SparseCsrCUDA: sum_csr + autogen: sum.out + +- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: sum.IntList_out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + NestedTensorCPU: NestedTensor_sum_dim_CPU + SparseCPU, SparseCUDA: sum_sparse_coo + tags: core + +- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: sum_out + MPS: sum_out_mps + +- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +# TODO: this function will be replaced once nested expand semantics have been settled on +- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor + dispatch: + NestedTensorCPU: _nested_sum_backward_cpu + +- func: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: nansum + MPS: nansum_mps + +- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out + MPS: nansum_out_mps + +- func: sum_to_size(Tensor self, SymInt[] size) -> Tensor + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: sum_to_size_symint + +- func: sqrt(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sqrt.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sqrt_sparse + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr + tags: [core, pointwise] + +- func: sqrt_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sqrt.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_ + tags: pointwise + +- func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sqrt_out + MPS: sqrt_out_mps + SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out + tags: pointwise + +- func: square(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: square_(Tensor(a!) 
self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function, method + tags: pointwise + +- func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: std(Tensor self, bool unbiased=True) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: std + MPS: std_mps + QuantizedCPU: std_quantized_cpu + +- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: std_mean + autogen: std_mean.correction_out + +- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + +- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: std_out + QuantizedCPU: std_out_quantized_cpu + +- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + +- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: prod + MPS: prod_mps + autogen: prod.out + +- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + structured_delegate: prod.int_out + device_check: NoCheck # TensorIterator + variants: function, method + +- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: prod_out + MPS: prod_out_mps + +- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: t(Tensor(a) self) -> Tensor(a) + device_check: NoCheck + device_guard: False + variants: function, method + dispatch: + CompositeExplicitAutograd: t + +- func: t_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck + device_guard: False + variants: method + tags: inplace_view + dispatch: + CompositeExplicitAutograd: t_ + +- func: tan(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: tan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: tan_sparse + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr + tags: pointwise + +- func: tan_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: tan.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: tan_sparse_ + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_ + tags: pointwise + +- func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tan_out + MPS: tan_out_mps + SparseCPU, SparseCUDA: tan_sparse_out + SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out + tags: pointwise + +- func: tanh(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: tanh.out + variants: function, method + dispatch: + QuantizedCPU: tanh_quantized_cpu + MkldnnCPU: mkldnn_tanh + SparseCPU, SparseCUDA: tanh_sparse + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh + tags: [core, pointwise] + +- func: tanh_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: tanh.out + variants: function, method + dispatch: + MkldnnCPU: mkldnn_tanh_ + SparseCPU, SparseCUDA: tanh_sparse_ + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_ + tags: pointwise + +- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tanh_out + MPS: tanh_out_mps + SparseCPU, SparseCUDA: tanh_sparse_out + SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out + tags: pointwise + +- func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor + variants: function + +- func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU, CUDA: tensordot_out + +# TODO: namespace threshold in 'nn' +- func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + structured_delegate: threshold.out + dispatch: + QuantizedCPU: threshold_quantized_cpu + +- func: threshold_(Tensor(a!) 
self, Scalar threshold, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + structured_delegate: threshold.out + +- func: threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: threshold_out + MPS: threshold_out_mps + +- func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: threshold_backward_out + MPS: threshold_backward_out_mps + SparseCPU, SparseCUDA: threshold_backward_sparse_out + SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed_out + +- func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor + variants: function + structured_delegate: threshold_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_relu_backward + SparseCPU, SparseCUDA: threshold_backward_sparse + SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed + NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested + tags: pointwise + +- func: tile(Tensor self, int[] dims) -> Tensor + variants: function, method + +- func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: transpose + NestedTensorCPU, NestedTensorCUDA: transpose_nested + +- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_transpose + +- func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: transpose_ + +- func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + device_check: NoCheck + device_guard: False + dispatch: + MkldnnCPU: mkldnn_transpose_ + autogen: _mkldnn_transpose.out + +- func: one_hot(Tensor self, int num_classes=-1) -> Tensor + python_module: nn + variants: function + tags: dynamic_output_shape + +- func: flip(Tensor self, int[] dims) -> Tensor + variants: function, method + dispatch: + CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip + MPS: flip_mps + autogen: flip.out + tags: core + +- func: fliplr(Tensor self) -> Tensor + variants: function, method + +- func: flipud(Tensor self) -> Tensor + variants: function, method + +- func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor + variants: function, method + dispatch: + CPU, MPS: roll + CUDA: roll_cuda + autogen: roll.out + +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args + +- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: rot90 + autogen: rot90.out + +- func: trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + +- func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor + +# Fused implementation detail for transformers. 
Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads). +- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu + CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda + autogen: _transform_bias_rescale_qkv.out + +- func: _nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask + autogen: _nested_tensor_from_mask.out + +- func: _nested_tensor_from_mask_left_aligned(Tensor t, Tensor mask) -> bool + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask_left_aligned + +- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor + device_check: NoCheck # cpu_nested_shape_example will always be on CPU + dispatch: + CPU: nested_from_padded_generic + CUDA: nested_from_padded_cuda + autogen: _nested_from_padded.out + +# These private functions are temporary. They will be updated/deleted when nested tensors switch to using SymInts for their metadata representation +- func: _nested_tensor_size(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size + autogen: _nested_tensor_size.out + +- func: _nested_tensor_strides(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides + autogen: _nested_tensor_strides.out + +- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets + autogen: _nested_tensor_storage_offsets.out + +# _nested_from_padded is not usable from Python, so +# _nested_from_padded_and_nested_example is available for testing. +- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example + autogen: _nested_from_padded_and_nested_example.out + +# The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation +# this will need to be updated +- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_view_from_buffer + +- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy + autogen: _nested_view_from_buffer_copy.out + +- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor + dispatch: + # calls unsqueeze + CompositeExplicitAutogradNonFunctional: _trilinear + autogen: _trilinear.out + +- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor + +- func: trunc(Tensor self) -> Tensor + structured_delegate: trunc.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: trunc_sparse + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr + tags: pointwise + +- func: trunc_(Tensor(a!) self) -> Tensor(a!) 
+ structured_delegate: trunc.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_ + tags: pointwise + +- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: trunc_out + MPS: trunc_out_mps + SparseCPU, SparseCUDA: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out + tags: pointwise +# Alias for trunc + +- func: fix(Tensor self) -> Tensor + variants: function, method + +- func: fix_(Tensor(a!) self) -> Tensor(a!) + variants: function, method + +- func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: type_as(Tensor self, Tensor other) -> Tensor + variants: method + +- func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool + variants: function + +- func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: _unique_cpu + CUDA: _unique_cuda + autogen: _unique.out + +- func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_dim_cpu + CUDA: unique_dim_cuda + tags: dynamic_output_shape + autogen: unique_dim.out + +- func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_consecutive_cpu + CUDA: unique_consecutive_cuda + MPS: unique_consecutive_mps + tags: dynamic_output_shape + autogen: unique_consecutive.out + +- func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: unique_dim_consecutive_cpu + CUDA: unique_dim_consecutive_cuda + MPS: unique_dim_consecutive_mps + tags: dynamic_output_shape + autogen: unique_dim_consecutive.out + +# _unique and _unique_dim are fragile and modifying them easily cause internal break +# the below operator is a temporary hack for adding return_counts support +# Please don't rely on these two operators, they will be removed soon + +- func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU: _unique2_cpu + CUDA: _unique2_cuda + MPS: _unique2_mps + tags: dynamic_output_shape + autogen: _unique2.out + +- func: _unsafe_view(Tensor self, SymInt[] size) -> Tensor + dispatch: + CompositeExplicitAutograd: _unsafe_view + autogen: _unsafe_view.out + +- func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: unsqueeze + SparseCPU, SparseCUDA: unsqueeze_sparse + QuantizedCPU, QuantizedCUDA: unsqueeze_quantized + NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested + tags: core + +- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + dispatch: + CompositeExplicitAutograd: unsqueeze_ + +- func: vander(Tensor x, int? 
N=None, bool increasing=False) -> Tensor + +- func: var(Tensor self, bool unbiased=True) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + tags: core + cpp_no_default_args: ["unbiased"] + +- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: var + MPS: var_mps + +- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: var_out + +- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + cpp_no_default_args: ["unbiased"] + +- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + cpp_no_default_args: ["unbiased"] + +- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + +- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: var_mean + autogen: var_mean.correction_out + +- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + cpp_no_default_args: ["unbiased"] + +- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) + device_check: NoCheck # TensorIterator + variants: function + +- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CPU, CUDA: where + MPS: where_mps + tags: [core, pointwise] + +- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: where_self_out + MPS: where_self_out_mps + +- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor + variants: function + +- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor + variants: function, method + +- func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor + variants: function + +- func: where(Tensor condition) -> Tensor[] + device_check: NoCheck # TensorIterator + variants: function + +- func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. +- func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor + variants: function + +- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: weight_norm_cpu + CUDA: weight_norm_cuda + autogen: _weight_norm_interface.out + +- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: weight_norm_backward_cpu + CUDA: weight_norm_backward_cuda + autogen: _weight_norm_interface_backward.out + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) + variants: function + +- func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: zeros + autogen: zeros.names_out + +- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: _efficientzerotensor + CUDA: _efficientzerotensor_cuda + Meta: _efficientzerotensor_meta + autogen: _efficientzerotensor.out + +- func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: zeros_symint + +- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: zeros_out + SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out + +- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + # NB: Although this composite mutates on the inside, it is + # non-differentiable so NonFunctional doesn't apply + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like + autogen: zeros_like.out + +- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor + variants: function + dispatch: + CPU: _standard_gamma_grad_cpu + CUDA: _standard_gamma_grad_cuda + autogen: _standard_gamma_grad.out + +- func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor + variants: function + dispatch: + CPU: _s_gamma_cpu + CUDA: _s_gamma_cuda + tags: nondeterministic_seeded + autogen: _standard_gamma.out + +- func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor + dispatch: + CPU: _dirichlet_grad_cpu + CUDA: _dirichlet_grad_cuda + autogen: _dirichlet_grad.out + +- func: _sample_dirichlet(Tensor self, Generator? 
generator=None) -> Tensor + tags: nondeterministic_seeded + variants: function + dispatch: + CPU: _s_dirichlet_cpu + CUDA: _s_dirichlet_cuda + autogen: _sample_dirichlet.out + +- func: poisson(Tensor self, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CPU: _s_poisson_cpu + CUDA: _s_poisson_cuda + tags: nondeterministic_seeded + autogen: poisson.out + +- func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + dispatch: + CPU: _s_binomial_cpu + CUDA: _s_binomial_cuda + tags: nondeterministic_seeded + autogen: binomial.out + +# When more variants get ported to native, this dispatch will get more +# complicated + +- func: native_norm(Tensor self, Scalar p=2) -> Tensor + dispatch: + SparseCPU, SparseCUDA: norm_sparse + autogen: native_norm.out + +- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor + dispatch: + SparseCPU, SparseCUDA: norm_sparse + autogen: native_norm.ScalarOpt_dim_dtype_out + +# TODO: reduce signatures down to one when optional args is available +- func: _sparse_sum(Tensor self) -> Tensor + +- func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + +- func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor + dispatch: + CompositeExplicitAutograd: _sparse_sum + autogen: _sparse_sum.dim_out + +- func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + +- func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor + dispatch: + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda + autogen: _sparse_sum_backward.out + +- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_sum_cpu + SparseCsrCUDA: _sparse_csr_sum_cuda + autogen: _sparse_csr_sum.dim_dtype_out + +- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_prod_cpu + SparseCsrCUDA: _sparse_csr_prod_cuda + autogen: _sparse_csr_prod.dim_dtype_out + +- func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + python_module: sparse + dispatch: + SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda + autogen: _sparse_softmax.out + +- func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + dispatch: + SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda + autogen: _sparse_softmax_backward_data.out + +- func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor + python_module: sparse + variants: function + +- func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + python_module: sparse + dispatch: + SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda + autogen: _sparse_log_softmax.out + +- func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + dispatch: + SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda + autogen: _sparse_log_softmax_backward_data.out + +- func: _spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor + python_module: sparse + dispatch: + CPU: spdiags + autogen: _spdiags.out + +- func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: norm + autogen: norm.ScalarOpt_dtype_out + +- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: norm + autogen: norm.Scalar_out + +- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + structured_delegate: norm.dtype_out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sparse_dtype_norm + +- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor + structured_delegate: norm.out + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sparse_norm + +- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: norm_dtype_out + MPS: norm_dtype_out_mps + +- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: norm_out + MPS: norm_out_mps + +# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd +- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) + variants: method, function + dispatch: + CompositeExplicitAutograd: frexp + tags: pointwise + +- func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) 
exponent) + dispatch: + CPU, CUDA: frexp_out + tags: pointwise + +# Deprecated (v.1.12) +- func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor + variants: function + +# Deprecated (v.1.12) +- func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + +- func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: clone + SparseCPU, SparseCUDA: clone_sparse + SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed + MkldnnCPU: mkldnn_clone + QuantizedCPU, QuantizedCUDA: quantized_clone + NestedTensorCPU, NestedTensorCUDA: clone_nested + autogen: clone.out + tags: core + +- func: positive(Tensor(a) self) -> Tensor(a) + variants: function, method + tags: pointwise + +- func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + dispatch: + CompositeExplicitAutograd: resize_as_ + autogen: resize_as, resize_as.out + tags: inplace_view + +- func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function, method + dispatch: + SparseCPU, SparseCUDA: resize_as_sparse_ + SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_compressed_ + autogen: resize_as_sparse, resize_as_sparse.out + +- func: zero_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: zero_ + MPS: zero_mps_ + Meta: zero_meta_ + SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ + MkldnnCPU: mkldnn_zero_ + NestedTensorCPU, NestedTensorCUDA: zero_nested_ + autogen: zero, zero.out + +- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sub_out + MPS: sub_out_mps + SparseCPU, SparseCUDA: sub_out_sparse + tags: pointwise + +- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: sub.out + dispatch: + SparseCPU, SparseCUDA: sub_sparse + ZeroTensor: sub_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor + tags: [core, pointwise] + +- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + structured_delegate: sub.out + dispatch: + SparseCPU, SparseCUDA: sub_sparse_ + tags: pointwise +# For C++ only, until we have conversion from C++ numbers to Tensor + +- func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: sub + tags: [core, pointwise] + +- func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: sub_ + autogen: sub.Scalar_out + tags: pointwise +# subtract, alias for sub + +- func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + +- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function, method + +- func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + variants: method + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function, method + +- func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + variants: method + +- func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: rsub + autogen: rsub.Tensor_out + +- func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: heaviside_out + tags: pointwise + +- func: heaviside(Tensor self, Tensor values) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: heaviside.out + tags: pointwise + +- func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: heaviside.out + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: rsub + autogen: rsub.Scalar_out + +# Functionally the same as addmm, but we give it a different derivative formula +# that doesn't propagate gradients to non-present entries on sparse. + tags: pointwise +- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + python_module: sparse + dispatch: + CompositeExplicitAutograd: _sparse_addmm + autogen: _sparse_addmm.out + +- func: sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+ python_module: sparse + dispatch: + SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu + +- func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + python_module: sparse + dispatch: + SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu + +- func: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor) + python_module: sparse + dispatch: + SparseCsrCPU: _sparse_mm_reduce_impl_sparse_csr_cpu + +- func: _sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor) + python_module: sparse + dispatch: + SparseCsrCPU: _sparse_mm_reduce_impl_backward_sparse_csr_cpu + +- func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmm_out_cpu + CUDA: addmm_out_cuda + MPS: addmm_out_mps + SparseCPU: addmm_out_sparse_dense_cpu + SparseCUDA: addmm_out_sparse_dense_cuda + SparseCsrCPU: addmm_out_sparse_compressed_cpu + SparseCsrCUDA: addmm_out_sparse_compressed_cuda + +- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + structured_delegate: addmm.out + variants: function, method + dispatch: + SparseCPU: addmm_sparse_dense_cpu + SparseCUDA: addmm_sparse_dense_cuda + SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense + tags: core + +- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + structured_delegate: addmm.out + variants: method + dispatch: + # Warning! For whatever reason, the inplace sparse addmm is NON + # broadcasting + SparseCPU: s_addmm_sparse_dense_cpu_ + SparseCUDA: s_addmm_sparse_dense_cuda_ + +- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmm_activation_out_cpu + CUDA: addmm_activation_out_cuda + +- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor + structured_delegate: _addmm_activation.out + variants: function, method + +# NOTE [ Sparse: autograd and API ] +# +# +# Sparse Tensor Constructors +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The API entry points to sparse tensor construction should be +# `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the +# indices and values tensors are given, they eventually dispatch to either +# `sparse_coo_tensor_with_dims` or `sparse_coo_tensor_with_dims_and_tensors`. +# +# The autograd support for the ctor is implemented on `sparse_coo_tensor_with_dims_and_tensors`. +# +# The API methods `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe` +# **must not** have specific type dispatches because otherwise codegen will +# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch +# using **Tensor** type, and thus lose autograd tracking on the actual method +# they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`. +# +# +# Sparse Methods API Design +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Goals: 1. Flexible API for users to write custom sparse ops +# 2.
ctor and member accessor with autograd support +# +# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the +# sense that misusing them will break sparse tensor invariants and may result in +# unexpected behavior, e.g., crash). These methods are all prefixed with +# underscore "_" to indicate that they should be used with care. We provide: +# +# + `_indices()`: returns the *raw* indices within the sparse tensor (not just +# sharing storage). Any inplace operation will change the +# actual indices, including t_, set_, as_strided_, resize_, +# etc. +# + `_values()`: returns the *raw* values within the sparse tensor. Similar +# semantics to `_indices()` +# + `_nnz()`: returns the number of non-zero entries. This will always be +# determined by the shapes of indices and values. +# + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and +# returns itself. +# +# These methods are very useful in writing new operations, e.g., a custom +# autograd Function. +# +# We also provide other public *safe* APIs: +# + `indices()`: returns a **view** of the indices tensor if the sparse tensor +# is **coalesced**. +# + `values()`: returns a **view** of the values tensor if the containing +# sparse tensor is **coalesced**. +# + `sparse_dim()`: number of sparse dimensions +# + `dense_dim()`: number of dense dimensions +# + `is_coalesced()`: whether the sparse tensor is coalesced +# +# `_indices()` and `_values()` should return the raw indices and values dense +# tensors within a sparse tensor. They can be quite unsafe with inplace +# operations like `t_()`, and expose uncoalesced indices and values. The public +# recommended API is `indices()` and `values()`, both of which first check that +# the tensor is coalesced and return views on those tensors. +# +# +# Autograd Support +# ~~~~~~~~~~~~~~~~ +# +# Autograd is supported on `values()` and sparse tensor ctor with indices and +# values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is +# differentiable w.r.t. `v`. +# +# NB: The `values()` and `_values()` operators are special in that they are +# layout-aware, i.e., the output depends not just on the data it represents, but +# also on the input layout details (in this case, the `indices` tensor). See +# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp +# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops +# operate in the layout-agnostic mode, similar to `as_strided`, the backward of +# these two operators needs to consider them in a layout-agnostic way: +# + `values()`: +# Input is coalesced. +# We just pretend having `input.indices()` as an additional argument +# `input_indices`, then forward is similar to +# `input.to(kStrided).index_select(input_indices)` regardless of the layout. +# Note that `values()` normally is layout-aware even if we constrain +# ourselves on sparse inputs since it may include all-zero values entries +# as "present" entries. +# + `_values()`: +# Input may be uncoalesced. +# It is not straightforward to construct a layout-agnostic version because +# duplicate indices entries may exist and additional parameterization is +# needed to distribute the value into different values entries. Furthermore, +# this op is intended to provide ways to write custom sparse ops, rather +# than being used in the autograd graph, so it is marked as *non-differentiable* +# in derivatives.yaml.
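+#
+# For example, a minimal usage sketch of the accessors described above,
+# assuming the standard `torch` Python frontend (tensor contents are illustrative only):
+#
+#   import torch
+#   i = torch.tensor([[0, 1, 1], [2, 0, 2]])
+#   v = torch.tensor([3., 4., 5.], requires_grad=True)
+#   st = torch.sparse_coo_tensor(i, v, (2, 3))
+#   st._indices(), st._values()    # raw tensors; may be uncoalesced
+#   sc = st.coalesce()
+#   sc.indices(), sc.values()      # safe views; require a coalesced tensor
+#   sc.values().sum().backward()   # gradient flows back to `v`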
+# +# Before reading the following, see NOTE [ Autograd Variable Views ] in +# variable.h for details on views that are tracked by autograd, and views that +# are not. +# +# Moreover, these methods return tensors that share storage with inputs, so we +# mark these methods as view ops to support autograd history tracking. +# The sparse tensor ctor output should technically be view of both input indices +# and values tensors, but currently we only support setting as view of a single +# Variable, so it is only view of the values tensor. +# TODO: clone indices in sparse tensor ctor. +# +# For other methods that return outputs that share storage with inputs, i.e., +# `indices()` and `_indices()`. We mark their outputs as non-differentiable, so +# the view relation is not tracked by autograd, but the version counter is still +# shared. In other words, their outputs are non-differentiable views of the +# sparse tensor. +# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given +# the default would never make sense. + +- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + +- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + +- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + +- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_coo_tensor + autogen: sparse_coo_tensor.size_out + +- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + +- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + +- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint + +- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () + +- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () +- func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () + +- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse + autogen: _sparse_coo_tensor_with_dims.out + +- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint + autogen: _sparse_coo_tensor_with_dims_and_tensors.out + +- func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: sparse_resize_ + autogen: sparse_resize, sparse_resize.out + +- func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) 
+ use_const_ref_for_mutable_tensors: True + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: sparse_resize_and_clear_ + autogen: sparse_resize_and_clear, sparse_resize_and_clear.out + +- func: sparse_mask(Tensor self, Tensor mask) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_mask + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr + autogen: sparse_mask.out + +- func: _sparse_mask_projection(Tensor self, Tensor mask) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_mask_projection + autogen: _sparse_mask_projection.out + +- func: _to_cpu(Tensor[] tensors) -> Tensor[] + variants: function + +- func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor + variants: method + +# Special case of to_dense with custom derivative +- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense + MkldnnCPU: mkldnn_to_dense + autogen: _to_dense.out + +- func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor + +- func: sparse_dim(Tensor self) -> int + variants: method + dispatch: + CPU, CUDA: sparse_dim_strided + SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr + device_check: NoCheck + device_guard: False + +# legacy method +- func: _dimI(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_dim_sparse + device_check: NoCheck + device_guard: False + +- func: dense_dim(Tensor self) -> int + variants: method + dispatch: + CPU, CUDA: dense_dim_strided + SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr + device_check: NoCheck + device_guard: False + +# legacy method +- func: _dimV(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse + device_check: NoCheck + device_guard: False + +- func: _nnz(Tensor self) -> int + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse + SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr + device_check: NoCheck + device_guard: False + +# NOTE: [ coalesce autograd ] +# coalesce returns self directly for already coalesced sparse tensors. +# This means coalesce cannot have a derivative registered, otherwise it creates +# circular references in the autograd graph (see gh-52874). +# Instead, the derivative is registered on the slow-path "_coalesce" +- func: coalesce(Tensor(a) self) -> Tensor(a) + variants: method + +- func: _coalesce(Tensor self) -> Tensor + dispatch: + SparseCPU: _coalesce_sparse_cpu + SparseCUDA: _coalesce_sparse_cuda + autogen: _coalesce.out + +- func: is_coalesced(Tensor self) -> bool + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: is_coalesced_sparse + CompositeExplicitAutograd: is_coalesced_default + device_check: NoCheck + device_guard: False + +- func: _indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _indices_sparse + device_check: NoCheck + device_guard: False + +- func: _values(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _values_sparse + device_check: NoCheck + device_guard: False + +# This method doesn't do any check but only directly sets the flag. So it can be +# a bit unsafe. 
Similar to _indices and _values, this is useful for implementing +# custom sparse operations in Python/C++ extension. +- func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: _coalesced_sparse_ + device_check: NoCheck + device_guard: False + autogen: _coalesced, _coalesced.out + +- func: indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: indices_sparse + CompositeExplicitAutograd: indices_default + device_check: NoCheck + device_guard: False + +- func: values(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCPU, SparseCUDA, SparseMeta: values_sparse + SparseCsrCPU, SparseCsrCUDA: values_sparse_csr + NestedTensorCPU, NestedTensorCUDA: values_nested + CompositeExplicitAutograd: values_default + device_check: NoCheck + device_guard: False + +- func: crow_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr + CompositeExplicitAutograd: crow_indices_default + device_check: NoCheck + device_guard: False + +- func: col_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr + CompositeExplicitAutograd: col_indices_default + device_check: NoCheck + device_guard: False + +- func: ccol_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + CompositeExplicitAutograd: ccol_indices_default + device_check: NoCheck + device_guard: False + +- func: row_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + CompositeExplicitAutograd: row_indices_default + device_check: NoCheck + device_guard: False + +- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + SparseCPU: hspmm_out_sparse_cpu + SparseCUDA: hspmm_out_sparse_cuda + +- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor + dispatch: + SparseCPU: hspmm_sparse_cpu + SparseCUDA: hspmm_sparse_cuda + +- func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + device_check: NoCheck # Allows copy into different device + variants: function + dispatch: + SparseCPU, SparseCUDA: copy_sparse_ + autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out + +# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors +- func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] + variants: function, method + dispatch: + CompositeExplicitAutograd: unbind + CompositeImplicitAutogradNestedTensor: NestedTensor_unbind + +- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] + variants: function, method + +- func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse + SparseCPU, SparseCUDA: sparse_coo_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + autogen: to_sparse.sparse_dim_out + +- func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse + SparseCPU, SparseCUDA: sparse_coo_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + autogen: to_sparse.out + +- func: to_sparse_csr(Tensor self, int? 
dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csr + SparseCPU, SparseCUDA: coo_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + autogen: to_sparse_csr.out + +- func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csc + SparseCPU, SparseCUDA: coo_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + autogen: to_sparse_csc.out + +- func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsr + SparseCPU, SparseCUDA: coo_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + autogen: to_sparse_bsr.out + +- func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsc + SparseCPU, SparseCUDA: coo_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc + autogen: to_sparse_bsc.out + +- func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor + variants: method + dispatch: + CPU: dense_to_mkldnn + autogen: to_mkldnn.out + +- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv2d_weight + autogen: mkldnn_reorder_conv2d_weight.out + +- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv3d_weight + autogen: mkldnn_reorder_conv3d_weight.out + +- func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor + +- func: quantize_per_tensor_dynamic(Tensor self, ScalarType dtype, bool reduce_range) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor_dynamic + autogen: quantize_per_tensor_dynamic.out + +- func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor + autogen: quantize_per_tensor.out + +- func: quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor_tensor_qparams + autogen: quantize_per_tensor.tensor_qparams_out + +- func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + variants: function + dispatch: + CPU: quantize_per_tensor_list_cpu + autogen: quantize_per_tensor.tensors_out + +- func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_channel + autogen: quantize_per_channel.out + +- func: dequantize.self(Tensor self) -> Tensor + variants: function, method + dispatch: + CPU, CUDA: dequantize_cpu_or_cuda + QuantizedCPU, QuantizedCUDA: dequantize_quantized + autogen: dequantize.self_out + +- func: dequantize.tensors(Tensor[] tensors) -> Tensor[] + variants: function + dispatch: + QuantizedCPU: dequantize_tensors_quantized_cpu + autogen: dequantize.tensors_out + +- func: q_scale(Tensor self) -> float + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_scale_quant + +- func: q_zero_point(Tensor self) -> int + variants: function, method + 
dispatch: + QuantizedCPU, QuantizedCUDA: q_zero_point_quant + +- func: q_per_channel_scales(Tensor self) -> Tensor + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_scales + autogen: q_per_channel_scales.out + +- func: q_per_channel_zero_points(Tensor self) -> Tensor + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points + autogen: q_per_channel_zero_points.out + +- func: q_per_channel_axis(Tensor self) -> int + variants: function, method + dispatch: + QuantizedCPU, QuantizedCUDA: q_per_channel_axis + +- func: int_repr(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + QuantizedCPU: int_repr_quantized_cpu + QuantizedCUDA: int_repr_quantized_cuda + autogen: int_repr.out + +- func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor + dispatch: + CPU: make_per_tensor_quantized_tensor_cpu + CUDA: make_per_tensor_quantized_tensor_cuda + autogen: _make_per_tensor_quantized_tensor.out + +- func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor + dispatch: + CPU: make_per_channel_quantized_tensor_cpu + CUDA: make_per_channel_quantized_tensor_cuda + autogen: _make_per_channel_quantized_tensor.out + +- func: qscheme(Tensor self) -> QScheme + variants: method + dispatch: + QuantizedCPU, QuantizedCUDA: qscheme_quant + +- func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine_cachemask + autogen: fake_quantize_per_tensor_affine_cachemask.out + +- func: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams + autogen: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out + +- func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + +- func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine + autogen: _fake_quantize_learnable_per_tensor_affine.out + +- func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine_backward + +- func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + +- func: fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, 
int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine_cachemask + autogen: fake_quantize_per_channel_affine_cachemask.out + +- func: fake_quantize_per_channel_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + +- func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine + autogen: _fake_quantize_learnable_per_channel_affine.out + +- func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine_backward + +- func: fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor + variants: function + +- func: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) + dispatch: + CPU: fused_moving_avg_obs_fake_quant_cpu + CUDA: fused_moving_avg_obs_fake_quant_cuda + autogen: _fused_moving_avg_obs_fq_helper_functional, _fused_moving_avg_obs_fq_helper.out + +- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) + variants: function + +- func: _saturate_weight_to_fp16(Tensor weight) -> Tensor + variants: function + +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) + variants: function + +- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a) + variants: method + device_guard: False + +- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a) + variants: method + device_guard: False + +- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: _to_copy + NestedTensorCPU, NestedTensorCUDA: _to_copy_nested + autogen: _to_copy.out + tags: core + +# to(Device) must not exist because all constructors of Device also works for +# TensorOptions. Otherwise, an ambiguity error is thrown. +# See NOTE [ TensorOptions Constructors ]. +- func: to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + +- func: meshgrid(Tensor[] tensors) -> Tensor[] + +# TODO: Two weeks after this lands, combine these two overloads, +# making "indexing" optional. These are temporarily distinct for +# forward-compatibility reasons. +- func: meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[] + +- func: cartesian_prod(Tensor[] tensors) -> Tensor + variants: function + +- func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor + variants: function + +- func: item(Tensor self) -> Scalar + tags: data_dependent_output + variants: method + +- func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + variants: function + +- func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + variants: function + +- func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + variants: function + +- func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + +- func: can_cast(ScalarType from, ScalarType to) -> bool + variants: function + +- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + variants: function + +# NB: Does NOT check precondition that numel == 1 +- func: _local_scalar_dense(Tensor self) -> Scalar + tags: data_dependent_output + dispatch: + CPU: _local_scalar_dense_cpu + CUDA: _local_scalar_dense_cuda + MPS: _local_scalar_dense_mps + variants: function + +# MPS LSTM implementation + +- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + MPS: _lstm_mps + autogen: _lstm_mps.out + tags: nondeterministic_seeded + +- func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + dispatch: + MPS: lstm_mps_backward + autogen: lstm_mps_backward.out + + +# Fused RNN kernels +- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_lstm_cell_cuda + autogen: _thnn_fused_lstm_cell.out + +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_lstm_cell_backward_impl_cuda + autogen: _thnn_fused_lstm_cell_backward_impl.out + +- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? 
grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +- func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_gru_cell_cuda + autogen: _thnn_fused_gru_cell.out + +- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _thnn_fused_gru_cell_backward_cuda + autogen: _thnn_fused_gru_cell_backward.out + +- func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + +# RNN cells and layers +- func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded + +- func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded + +- func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded + +- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + +- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +- func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +- func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> Tensor + +# Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp` + +# Quantized RNN layers +# - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) + + +# - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) + + +# Quantized GRU layers + +# - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) +# + +# - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) +# + +# Quantized RNN cells +- func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) + +- func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +- func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +- func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor + +# PackedSequence utilities +- func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _pack_padded_sequence + autogen: _pack_padded_sequence.out + +- func: _pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor + dispatch: + CompositeImplicitAutograd: _pack_padded_sequence_backward_symint + +- func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) + +# wrappers for legacy TH methods + +- func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: set_ + autogen: set.source_Storage, set.source_Storage_out + +- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) 
+ variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU: set_storage_cpu_ + Meta: set_storage_meta__symint + CUDA: set_storage_cuda_ + MPS: set_storage_mps_ + QuantizedCPU, QuantizedCUDA: set_storage_quantized_ + autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out + +- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: set__symint + +- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: set_tensor_ + autogen: set.source_Tensor, set.source_Tensor_out + +- func: set_(Tensor(a!) self) -> Tensor(a!) + variants: method + dispatch: + CPU: set_cpu_ + CUDA: set_cuda_ + Meta: set_meta_ + MPS: set_mps_ + autogen: set, set.out + +# Not making it CompositeImplicitAutograd because lift +# should be a primitive w.r.t. functorch + +# TODO: this should have a view annotation +# TODO: shouldn't be a method +- func: lift(Tensor self) -> Tensor + dispatch: + CompositeExplicitAutograd: lift + autogen: lift.out + +# lift_fresh is called with an argument that is guaranteed to be +# fresh (i.e., newly allocated). This is ONLY called from a +# torch.tensor call; if you FX trace a lift_fresh, you are obligated +# to convert this into a lift_fresh_copy (because FX will violate the +# freshness invariant when tracing). +- func: lift_fresh(Tensor(a) self) -> Tensor(a) + dispatch: + CompositeExplicitAutograd: lift_fresh + +# Like lift, but it clones the input. +- func: lift_fresh_copy(Tensor self) -> Tensor + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: lift_fresh_copy + autogen: lift_fresh_copy.out + +- func: is_set_to(Tensor self, Tensor tensor) -> bool + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, MPS: is_set_to + +- func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: masked_fill__cpu + CUDA: masked_fill__cuda + QuantizedCPU: masked_fill__quantized_cpu + QuantizedCUDA: masked_fill__quantized_cuda + MPS: masked_fill__mps + autogen: masked_fill.Scalar_out + +- func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_fill + NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill + tags: pointwise + +- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: masked_fill__cpu + CUDA: masked_fill__cuda + QuantizedCPU: masked_fill__quantized_cpu + QuantizedCUDA: masked_fill__quantized_cuda + MPS: masked_fill__mps + autogen: masked_fill.Tensor_out + +- func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_fill + +- func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) 
+ variants: method + dispatch: + CPU: masked_scatter__cpu + CUDA: masked_scatter__cuda + MPS: masked_scatter__mps + autogen: masked_scatter.out + +- func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: masked_scatter + +- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor + dispatch: + CUDA: masked_softmax_cuda + CPU: masked_softmax_cpu + autogen: _masked_softmax.out + +- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor + dispatch: + CUDA: masked_softmax_backward_cuda + CPU: masked_softmax_backward_cpu + autogen: _masked_softmax_backward.out + +- func: view(Tensor(a) self, SymInt[] size) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view + MkldnnCPU: mkldnn_view + NestedTensorCPU, NestedTensorCUDA: view_nested + tags: core + +# Warning: If you want to change the name or overload name of this +# operator, you might also want to change the `isBlockListedSchema` +# function in `torch/csrc/jit/frontend/schema_catching.cpp`. +# The name and overload name of this operator is hardcoded in that +# function in order to workaround a bug: +# https://github.com/pytorch/pytorch/issues/47964 +- func: view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: view_dtype + +- func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: put_ + autogen: put.out + +- func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: put + +- func: index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_add_cpu_out + CUDA: index_add_cuda_out + MPS: index_add_mps_out + +- func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!) + structured_delegate: index_add.out + variants: method + +- func: index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor + structured_delegate: index_add.out + variants: function, method + +- func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor + variants: function, method + +- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_reduce_cpu_out + CUDA: index_reduce_cuda_out + +- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: index_reduce.out + variants: method + +- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: index_reduce.out + variants: function, method + +- func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU: index_fill_ + CUDA: index_fill_ + MPS: index_fill_mps_ + autogen: index_fill.int_Scalar_out + +- func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_fill + +- func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: index_fill_ + MPS: index_fill_mps_ + autogen: index_fill.int_Tensor_out + +- func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + dispatch: + CompositeExplicitAutograd: index_fill + +- func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + +- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter.src_out + variants: function, method + +- func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + structured_delegate: scatter.src_out + variants: method + +- func: scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_src_out + MPS: scatter_src_out_mps + +- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor + structured_delegate: scatter.value_out + variants: function, method + +- func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + structured_delegate: scatter.value_out + variants: method + +- func: scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_value_out + MPS: scatter_value_out_mps + +- func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor + structured_delegate: scatter.reduce_out + variants: function, method + +- func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) + structured_delegate: scatter.reduce_out + variants: method + +- func: scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_reduce_out + MPS: scatter_reduce_out_mps + +- func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor + structured_delegate: scatter.value_reduce_out + variants: function, method + +- func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) 
+ structured_delegate: scatter.value_reduce_out + variants: method + +- func: scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_value_reduce_out + MPS: scatter_value_reduce_out_mps + +- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor + variants: function, method + +- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor + variants: function, method + +- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter_add.out + variants: function, method + tags: core + +- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + structured_delegate: scatter_add.out + variants: method + +- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_add + MPS: scatter_add_mps_out + +- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor + variants: function, method + +- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: scatter_reduce.two_out + variants: function, method + tags: core + +- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: scatter_reduce.two_out + variants: method + +- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_reduce_two + +- func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: eq.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: eq.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_and_out + MPS: bitwise_and_out_mps + tags: pointwise + +- func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and_out + tags: pointwise + +- func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_and + tags: pointwise + +- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and + autogen: bitwise_and.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_and.Tensor_out + tags: [core, pointwise] + +- func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_and.Tensor_out + tags: pointwise + +- func: __and__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __and__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_or_out + MPS: bitwise_or_out_mps + tags: pointwise + +- func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or_out + tags: pointwise + +- func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or + autogen: bitwise_or.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_or.Tensor_out + tags: [core, pointwise] + +- func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_or.Tensor_out + tags: pointwise + +- func: __or__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __or__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + +- func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + variants: function + dispatch: + CPU, CUDA: bitwise_xor_out + MPS: bitwise_xor_out_mps + tags: pointwise + +- func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor_out + tags: pointwise + +- func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor + autogen: bitwise_xor.Scalar_Tensor_out + tags: pointwise + +- func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: bitwise_xor.Tensor_out + tags: [core, pointwise] + +- func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_xor.Tensor_out + tags: pointwise + +- func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __lshift__ + tags: pointwise + +- func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __lshift__ + tags: pointwise + +- func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __ilshift__ + autogen: __lshift__.Scalar_out + tags: pointwise + +- func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __ilshift__ + autogen: __lshift__.Tensor_out + tags: pointwise + +- func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_left_shift.Tensor_out + tags: pointwise + +- func: bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_left_shift.Tensor_out + tags: pointwise + +- func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_left_shift_out + tags: pointwise + +- func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift + tags: pointwise + +- func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_left_shift_ + tags: pointwise + +- func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift_out + tags: pointwise + +- func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_left_shift + autogen: bitwise_left_shift.Scalar_Tensor_out + tags: pointwise + +- func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __rshift__ + tags: pointwise + +- func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: __rshift__ + tags: pointwise + +- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __irshift__ + autogen: __rshift__.Scalar_out + +- func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: __irshift__ + autogen: __rshift__.Tensor_out + +- func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_right_shift.Tensor_out + tags: pointwise + +- func: bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_right_shift.Tensor_out + tags: pointwise + +- func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_right_shift_out + tags: pointwise + +- func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift + tags: pointwise + +- func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: bitwise_right_shift_ + tags: pointwise + +- func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift_out + tags: pointwise + +- func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_right_shift + autogen: bitwise_right_shift.Scalar_Tensor_out + tags: pointwise + +- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + structured_delegate: tril.out + variants: method + +- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + structured_delegate: triu.out + variants: method + +- func: digamma_(Tensor(a!) self) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured_delegate: digamma.out + variants: method + tags: pointwise + +- func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: lerp.Scalar_out + tags: pointwise + +- func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: lerp.Tensor_out + tags: pointwise + +- func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: addbmm_ + MPS: addbmm_mps_ + +- func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addbmm_out + MPS: addbmm_out_mps + +- func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: addbmm + MPS: addbmm_mps + +- func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: random_ + Meta: random_meta_ + MPS: random_mps_ + autogen: random.from, random.from_out + +- func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: random_ + Meta: random_meta_ + MPS: random_mps_ + autogen: random.to, random.to_out + +- func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: random_ + MPS: random_mps_ + Meta: random_meta_ + autogen: random, random.out + +- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: uniform_ + MPS: uniform_mps_ + Meta: uniform_meta_ + autogen: uniform, uniform.out + +- func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: cauchy_ + autogen: cauchy, cauchy.out + +- func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: log_normal_ + autogen: log_normal, log_normal.out + +- func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: exponential_ + MPS: exponential_mps_ + autogen: exponential, exponential.out + +- func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: geometric_ + + # wrappers for TH functions + autogen: geometric, geometric.out + +- func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) 
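+
+# A minimal usage sketch for the in-place sampling ops declared above (random_, uniform_,
+# cauchy_, log_normal_, exponential_, geometric_): they all accept an optional keyword-only
+# Generator and are tagged nondeterministic_seeded. Illustrative only, assuming the standard
+# Python bindings that these schema entries generate; not part of the schema itself:
+#
+#   import torch
+#   g = torch.Generator().manual_seed(0)      # explicit generator for reproducible draws
+#   t = torch.empty(4)
+#   t.uniform_(0.0, 1.0, generator=g)         # uniform_(float from=0, float to=1, *, Generator?)
+#   t.exponential_(1.0, generator=g)          # exponential_(float lambd=1, *, Generator?)
+#   t.geometric_(0.5, generator=g)            # geometric_(float p, *, Generator?)
+#   t.random_(0, 10, generator=g)             # random_.from(int from, int? to, *, Generator?)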
+ +- func: diag(Tensor self, int diagonal=0) -> Tensor + variants: method, function + +- func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor + variants: method, function + +- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: triu_cpu + CUDA: triu_cuda + MPS: triu_mps_out + +- func: triu(Tensor self, int diagonal=0) -> Tensor + structured_delegate: triu.out + variants: method, function + +- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: tril_cpu + CUDA: tril_cuda + MPS: tril_mps_out + +- func: tril(Tensor self, int diagonal=0) -> Tensor + structured_delegate: tril.out + variants: method, function + +- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: tril_indices_cpu + CUDA: tril_indices_cuda + autogen: tril_indices.out + +- func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CPU: triu_indices_cpu + CUDA: triu_indices_cuda + autogen: triu_indices.out + +- func: trace(Tensor self) -> Tensor + variants: method, function + dispatch: + CPU: trace_cpu + CUDA: trace_cuda + MPS: trace_mps + autogen: trace.out + +- func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: trace_backward_symint + +- func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ne_Scalar_out + MPS: ne_scalar_out_mps + QuantizedCPU: ne_out_quantized_cpu + tags: pointwise + +- func: ne.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ne.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ne_quantized_cpu + tags: [core, pointwise] + +- func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ne_Tensor_out + MPS: ne_tensor_out_mps + QuantizedCPU: ne_out_quantized_cpu + tags: pointwise + +- func: ne.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ne.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ne_quantized_cpu + tags: [core, pointwise] + +- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: ne.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: ne.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# not_equal, alias for torch.ne +- func: not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: not_equal_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) + variants: method + +- func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: eq_Scalar_out + MPS: eq_scalar_out_mps + QuantizedCPU: eq_out_quantized_cpu + tags: pointwise + +- func: eq.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: eq.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: eq_quantized_cpu + tags: [core, pointwise] + +- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: eq_Tensor_out + MPS: eq_tensor_out_mps + QuantizedCPU: eq_out_quantized_cpu + tags: pointwise + +- func: eq.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: eq.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: eq_quantized_cpu + tags: [core, pointwise] + +- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ge_Scalar_out + MPS: ge_scalar_out_mps + QuantizedCPU: ge_out_quantized_cpu + tags: pointwise + +- func: ge.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ge.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ge_quantized_cpu + tags: [core, pointwise] + +- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: ge_Tensor_out + MPS: ge_tensor_out_mps + QuantizedCPU: ge_out_quantized_cpu + tags: pointwise + +- func: ge.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ge.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: ge_quantized_cpu + tags: [core, pointwise] + +- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: ge.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: ge.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# greater_equal, alias for torch.ge +- func: greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: le_Scalar_out + MPS: le_scalar_out_mps + QuantizedCPU: le_out_quantized_cpu + tags: pointwise + +- func: le.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: le.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: le_quantized_cpu + tags: [core, pointwise] + +- func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: le_Tensor_out + MPS: le_tensor_out_mps + QuantizedCPU: le_out_quantized_cpu + tags: pointwise + +- func: le.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: le.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: le_quantized_cpu + tags: [core, pointwise] + +- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: le.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: le.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# less_equal, alias for torch.le +- func: less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: gt_Scalar_out + MPS: gt_scalar_out_mps + QuantizedCPU: gt_out_quantized_cpu + tags: pointwise + +- func: gt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: gt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: gt_quantized_cpu + tags: [core, pointwise] + +- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: gt_Tensor_out + MPS: gt_tensor_out_mps + QuantizedCPU: gt_out_quantized_cpu + tags: pointwise + +- func: gt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: gt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: gt_quantized_cpu + tags: [core, pointwise] + +- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: gt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: gt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# greater, alias for torch.gt +- func: greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
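+
+# The comparison families above (ne/eq/ge/le/gt with their structured *.Scalar_out and
+# *.Tensor_out kernels) are also exposed under the alias names noted in the comments
+# (not_equal, greater_equal, less_equal, greater); the alias entries declare no dispatch of
+# their own. A minimal usage sketch, assuming the standard Python bindings for these schemas
+# (illustrative only):
+#
+#   import torch
+#   a = torch.tensor([1, 2, 3])
+#   b = torch.tensor([3, 2, 1])
+#   assert torch.equal(torch.gt(a, b), torch.greater(a, b))   # greater is an alias for gt
+#   out = torch.empty(3, dtype=torch.bool)
+#   torch.gt(a, b, out=out)                                   # routes through gt.Tensor_out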
+ +- func: greater.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: lt_Scalar_out + MPS: lt_scalar_out_mps + QuantizedCPU: lt_out_quantized_cpu + tags: pointwise + +- func: lt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: lt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: lt_quantized_cpu + tags: [core, pointwise] + +- func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: lt_Tensor_out + MPS: lt_tensor_out_mps + QuantizedCPU: lt_out_quantized_cpu + tags: pointwise + +- func: lt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: lt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + QuantizedCPU: lt_quantized_cpu + tags: [core, pointwise] + +- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: lt.Scalar_out + device_check: NoCheck # TensorIterator + variants: method + +- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: lt.Tensor_out + device_check: NoCheck # TensorIterator + variants: method + +# less, alias for torch.lt +- func: less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + +- func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Tensor(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + +- func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: take_out + +- func: take(Tensor self, Tensor index) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: take + +- func: take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + +- func: take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor + variants: method, function + +- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, QuantizedCPU: index_select_out_cpu_ + CUDA, QuantizedCUDA: index_select_out_cuda + MPS: index_select_out_mps + +- func: index_select(Tensor self, int dim, Tensor index) -> Tensor + variants: method, function + dispatch: + CPU: index_select_cpu_ + QuantizedCPU: index_select_quantized_cpu_ + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda + SparseCPU: index_select_sparse_cpu + SparseCUDA: index_select_sparse_cuda + MPS: index_select_mps + tags: core + +- func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) 
out) -> Tensor(a!) + +- func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor + variants: method, function + +- func: index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + dispatch: + CompositeImplicitAutograd: index_select_backward_symint + +- func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: masked_select_out_cpu + CUDA: masked_select_out_cuda + MPS: masked_select_out_mps + tags: dynamic_output_shape + +- func: masked_select(Tensor self, Tensor mask) -> Tensor + variants: method, function + dispatch: + CPU: masked_select_cpu + CUDA: masked_select_cuda + MPS: masked_select_mps + tags: dynamic_output_shape + +- func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: nonzero_out_cpu + CUDA: nonzero_out_cuda + MPS: nonzero_out_mps + tags: dynamic_output_shape + +- func: nonzero(Tensor self) -> Tensor + variants: method, function + dispatch: + CPU: nonzero_cpu + CUDA: nonzero_cuda + MPS: nonzero_mps + tags: [dynamic_output_shape, core] + +- func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: nonzero_static_out_cpu + +- func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + CPU: nonzero_static_cpu + +- func: nonzero_numpy(Tensor self) -> Tensor[] + variants: method, function + +- func: argwhere(Tensor self) -> Tensor + variants: method, function + tags: dynamic_output_shape + +- func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU, CUDA: gather_out + MPS: gather_out_mps + +- func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor + variants: method, function + structured_delegate: gather.out + tags: core + +- func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor + variants: function + device_check: NoCheck + device_guard: False + +- func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) + +- func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor + variants: method, function + +- func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor + +- func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: addcmul_out + MPS: addcmul_out_mps + tags: pointwise + +- func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcmul.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcmul.out + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: addcdiv_out + MPS: addcdiv_out_mps + tags: pointwise + +- func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcdiv.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcdiv.out + device_check: NoCheck # TensorIterator + variants: method + tags: pointwise + +- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: cross_entropy_loss_symint + +- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient) + structured: True + dispatch: + CPU, CUDA: triangular_solve_out + MPS: triangular_solve_mps_out + SparseCsrCPU: triangular_solve_out_sparse_csr_cpu + SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda + +- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) + structured_delegate: triangular_solve.X + variants: method, function + +- func: _linalg_check_errors(Tensor info, str api_name, *, bool is_matrix) -> () + dispatch: + CompositeExplicitAutograd: _linalg_check_errors + +- func: linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + dispatch: + CPU, CUDA: linalg_solve_triangular_out + MPS: linalg_solve_triangular_mps_out + +- func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_solve_triangular + MPS: linalg_solve_triangular_mps + +- func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor + python_module: linalg + dispatch: + CompositeImplicitAutograd: linalg_vander_symint + +- func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) + +- func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) + variants: method, function + +# swapaxes, alias for transpose +- func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +# swapdims, alias for transpose +- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + +- func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + tags: inplace_view + +- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: cholesky_out + +- func: cholesky(Tensor self, bool upper=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: cholesky + +- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: cholesky_solve_out + +- func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor + variants: method, function + dispatch: + CompositeExplicitAutograd: cholesky_solve + +- func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor + variants: function + dispatch: + CPU: _cholesky_solve_helper_cpu + CUDA: _cholesky_solve_helper_cuda + autogen: _cholesky_solve_helper.out + +- func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: cholesky_inverse + +- func: cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cholesky_inverse_out + +- func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + +- func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) + variants: method, function + +- func: geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau) + dispatch: + CPU, CUDA: geqrf_out + +- func: geqrf(Tensor self) -> (Tensor a, Tensor tau) + variants: method, function + dispatch: + CPU, CUDA: geqrf + +# orgqr, alias for linalg_householder_product +- func: orgqr(Tensor self, Tensor input2) -> Tensor + variants: method, function + +- func: orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!) + +- func: ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: ormqr_out + +- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: ormqr + +- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info) + variants: function + +- func: lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!) + +- func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor + variants: method, function + +# lu_unpack +- func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) + structured_delegate: lu_unpack.out + variants: function + +- func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + variants: function + structured: True + dispatch: + CPU, CUDA: lu_unpack_out + +# TODO: remove dispatch section when porting TH CUDA to ATen +- func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: multinomial_out + MPS: multinomial_out_mps + +- func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor + variants: method, function + dispatch: + CPU, CUDA: multinomial + MPS: multinomial_mps + tags: nondeterministic_seeded + +- func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lgamma_out + tags: pointwise + +- func: lgamma_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: lgamma.out + variants: method + tags: pointwise + +- func: lgamma(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: lgamma.out + variants: method, function + tags: pointwise + +- func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: digamma_out + tags: pointwise + +- func: digamma(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: digamma.out + variants: method, function + tags: pointwise + +- func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: polygamma_out + tags: pointwise + +- func: polygamma(int n, Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: polygamma.out + variants: method, function + tags: pointwise + +- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: polygamma_ + tags: pointwise + +- func: erfinv(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: erfinv.out + variants: method, function + dispatch: + SparseCPU, SparseCUDA: erfinv_sparse + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr + tags: pointwise + +- func: erfinv_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: erfinv.out + variants: method + dispatch: + SparseCPU, SparseCUDA: erfinv_sparse_ + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_ + tags: pointwise + +- func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: erfinv_out + SparseCPU, SparseCUDA: erfinv_sparse_out + SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out + tags: pointwise + +- func: i0(Tensor self) -> Tensor + structured_delegate: i0.out + variants: function, method + tags: pointwise + +- func: i0_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: i0.out + variants: function, method + tags: pointwise + +- func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: i0_out + tags: pointwise + +- func: sign(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: sign.out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: sign_sparse + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr + tags: [core, pointwise] + +- func: sign_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: sign.out + variants: method + dispatch: + SparseCPU, SparseCUDA: sign_sparse_ + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_ + tags: pointwise + +- func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sign_out + MPS: sign_out_mps + SparseCPU, SparseCUDA: sign_sparse_out + SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out + tags: pointwise + +- func: signbit(Tensor self) -> Tensor + variants: function, method + structured_delegate: signbit.out + dispatch: + SparseCPU, SparseCUDA: signbit_sparse + SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr + tags: pointwise + +- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU: signbit_out + CUDA: signbit_out + MPS: signbit_out_mps + SparseCPU, SparseCUDA: signbit_sparse_out + SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out + tags: pointwise + +- func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: dist + autogen: dist.out + +- func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: atan2_out + MPS: atan2_mps_out + tags: pointwise + +- func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: atan2.out + variants: method + tags: pointwise + +- func: atan2(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: atan2.out + variants: method, function + tags: pointwise +# arctan2, alias of atan2 + +- func: arctan2(Tensor self, Tensor other) -> Tensor + variants: method, function + +- func: arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + +- func: arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lerp_Scalar + tags: pointwise + +- func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: lerp_Tensor + tags: pointwise + +- func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: lerp.Scalar_out + tags: pointwise + +- func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: lerp.Tensor_out + tags: pointwise + +- func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, MPS: histogram_histc_out + CUDA: _histc_out_cuda + +- func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor + variants: method, function + dispatch: + CPU, MPS: histogram_histc + CUDA: _histc_cuda + +- func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU, MPS: histogram_out + +- func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? 
weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU, MPS: histogram + +- func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU, MPS: histogram_out + +- func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU, MPS: histogram + +- func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[] + dispatch: + CPU, MPS: histogramdd_bin_edges + autogen: _histogramdd_bin_edges.out + +- func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor + dispatch: + CPU, MPS: _histogramdd + autogen: _histogramdd_from_bin_cts.out + +- func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor + dispatch: + CPU, MPS: _histogramdd + autogen: _histogramdd_from_bin_tensors.out + +- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: fmod_out + tags: pointwise + +- func: fmod.Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: fmod + tags: pointwise + +- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: fmod_ + tags: pointwise + +- func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: fmod_out + MPS: fmod_mps_out + tags: pointwise + +- func: fmod.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: fmod.Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: fmod.Tensor_out + tags: pointwise + +- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: hypot_out + MPS: hypot_out_mps + tags: pointwise + +- func: hypot(Tensor self, Tensor other) -> Tensor + structured_delegate: hypot.out + variants: method, function + tags: pointwise + +- func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: hypot.out + variants: method + tags: pointwise + +- func: igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: igamma_out + tags: pointwise + +- func: igamma(Tensor self, Tensor other) -> Tensor + structured_delegate: igamma.out + variants: method, function + tags: pointwise + +- func: igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: igamma.out + variants: method + tags: pointwise + +- func: igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: igammac_out + tags: pointwise + +- func: igammac(Tensor self, Tensor other) -> Tensor + structured_delegate: igammac.out + variants: method, function + tags: pointwise + +- func: igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: igammac.out + variants: method + tags: pointwise + +- func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: nextafter_out + tags: pointwise + +- func: nextafter(Tensor self, Tensor other) -> Tensor + structured_delegate: nextafter.out + variants: method, function + tags: pointwise + +- func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: nextafter.out + variants: method + tags: pointwise + +- func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: remainder_out + tags: pointwise + +- func: remainder.Scalar(Tensor self, Scalar other) -> Tensor + variants: method, function + dispatch: + CompositeExplicitAutograd: remainder + tags: pointwise + +- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: remainder_ + tags: pointwise + +- func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: remainder_out + MPS: remainder_out_mps + tags: pointwise + +- func: remainder.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out + variants: method + tags: pointwise + +- func: remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA, MPS: remainder + autogen: remainder.Scalar_Tensor_out + tags: pointwise + +- func: min(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: min + MPS: min_mps + QuantizedCPU: min_quantized_cpu + +- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: min_unary_out + QuantizedCPU: min_quantized_unary_out + +- func: fmin(Tensor self, Tensor other) -> Tensor + structured_delegate: fmin.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA, MPS: fmin_out + tags: pointwise + +- func: max(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: max + MPS: max_mps + QuantizedCPU: max_quantized_cpu + +- func: fmax(Tensor self, Tensor other) -> Tensor + structured_delegate: fmax.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA, MPS: fmax_out + tags: pointwise + +- func: maximum(Tensor self, Tensor other) -> Tensor + structured_delegate: maximum.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: [core, pointwise] + +- func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: maximum_out + MPS: maximum_out_mps + tags: pointwise + +# binary max, alias of maximum +# NOTE: max is not an alias for maximum, since there is also unary max +- func: max.other(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: pointwise + +- func: max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: max_unary_out + QuantizedCPU: max_quantized_unary_out + +- func: minimum(Tensor self, Tensor other) -> Tensor + structured_delegate: minimum.out + device_check: NoCheck # TensorIterator + variants: method, function + tags: [core, pointwise] + +- func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: minimum_out + MPS: minimum_out_mps + tags: pointwise + +# binary min, alias for minimum +# NOTE: min is not an alias for minimum, since there is also unary min +- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: pointwise + +- func: min.other(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + tags: pointwise + +- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) 
+ +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor + variants: method, function + +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) + +- func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: sort_out + +- func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True + dispatch: + CPU, CUDA: sort_stable_out + MPS: sort_stable_out_mps + +- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CompositeExplicitAutograd: sort + +- func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + structured_delegate: sort.values_stable + variants: method, function + dispatch: + QuantizedCPU: sort_quantized_cpu_stable + +- func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + +- func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) + variants: method, function + +- func: sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) + variants: method, function + +- func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: msort(Tensor self) -> Tensor + variants: method, function + +- func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + +- func: argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA, MPS: argsort_stable + autogen: argsort.stable_out + +- func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor + variants: method, function + +- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True + dispatch: + CPU: topk_out_cpu + CUDA: topk_out_cuda + MPS: topk_out_mps + +- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) + variants: method, function + structured_delegate: topk.values + dispatch: + QuantizedCPU: topk_quantized_cpu + tags: core + +- func: all(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.all_out + variants: method, function + +- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck + structured: True + dispatch: + CPU, CUDA: all_all_out + MPS: all_all_out_mps + +- func: any(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.all_out + variants: method, function + dispatch: + SparseCPU, SparseCUDA: any_sparse + +- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + dispatch: + CPU, CUDA: any_all_out + MPS: any_all_out_mps + +- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: renorm_out + +- func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + structured_delegate: renorm.out + +- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: renorm.out + +- func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) + variants: method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, MPS: unfold + QuantizedCPU, QuantizedCUDA: unfold + +- func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor + variants: function + dispatch: + CPU, CUDA: unfold_backward + autogen: unfold_backward.out + +- func: equal(Tensor self, Tensor other) -> bool + tags: [data_dependent_output, pointwise] + variants: method, function + dispatch: + CPU: cpu_equal + CUDA: cuda_equal + MPS: mps_equal + QuantizedCPU: equal_quantized_cpu + +- func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: pow_Tensor_Tensor_out + MPS: pow_tensor_tensor_out_mps + tags: pointwise + +- func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Tensor_out + variants: method, function + tags: [core, pointwise] + +- func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: pow_Scalar_out + MPS: pow_Scalar_out_mps + tags: pointwise + +- func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Scalar_out + tags: pointwise + +- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: pow_Tensor_Scalar_out + SparseCPU, SparseCUDA: pow_out_sparse_scalar + MPS: pow_tensor_scalar_out_mps + tags: pointwise + +- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Scalar_out + variants: function, method + dispatch: + SparseCPU, SparseCUDA: pow_sparse_scalar + tags: [core, pointwise] + +- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Scalar_out + variants: method + tags: pointwise + +- func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured_delegate: pow.Tensor_Tensor_out + variants: method + tags: pointwise + +- func: float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor + variants: function, method + tags: pointwise + +- func: float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor + tags: pointwise + +- func: float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + tags: pointwise + +- func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + variants: function, method + tags: pointwise + +- func: float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + variants: method + tags: pointwise + +- func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) + variants: method + tags: pointwise + +- func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + variants: method + dispatch: + CPU, CUDA: normal_ + MPS: normal_mps_ + Meta: normal_meta_ + SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: normal_nested_ + autogen: normal.out + +# Only used by the functionalization pass. +# Normally, the codegen would be able to generate a normal() NativeFunction, +# but we can't due to overload ambiguity with normal.Tensor_float. +- func: normal_functional(Tensor self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor + device_check: NoCheck # TensorIterator + tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutograd: normal_functional + +- func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + tags: nondeterministic_seeded + dispatch: + CPU, CUDA: normal_out + MPS: normal_mps_out + Meta: normal_out_meta + +- func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out + tags: nondeterministic_seeded + +- func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out + tags: nondeterministic_seeded + +- func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal + MPS: normal_mps + Meta: normal_meta + tags: nondeterministic_seeded + +- func: normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: normal + tags: nondeterministic_seeded + +- func: normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? 
generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: normal_out + tags: nondeterministic_seeded + +- func: alias(Tensor(a) self) -> Tensor(a) + variants: method, function + dispatch: + CompositeExplicitAutograd: alias + tags: core + +- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () + variants: function + dispatch: + CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ + autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out + +- func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) + variants: function + dispatch: + CUDA: _amp_update_scale_cuda_ + autogen: _amp_update_scale, _amp_update_scale.out + + #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + #dispatch: + #CPU: _cat_cpu + #CUDA: cat_cuda + #MPS: cat_mps + #QuantizedCPU: cat_quantized_cpu + + #- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + #dispatch: + #CPU: _cat_out_cpu + #CUDA: cat_out_cuda + #QuantizedCPU: cat_out_quantized_cpu + +- func: _foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow + CUDA: foreach_tensor_add_scalar_kernel_cuda + +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow_ + CUDA: foreach_tensor_add_scalar_kernel_cuda_ + autogen: _foreach_add.Scalar_out + +- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow + CUDA: foreach_tensor_add_list_kernel_cuda + +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow_ + CUDA: foreach_tensor_add_list_kernel_cuda_ + autogen: _foreach_add.List_out + +- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + autogen: _foreach_add.ScalarList_out + +- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow + CUDA: foreach_tensor_sub_scalar_kernel_cuda + +- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + 
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow_ + CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + autogen: _foreach_sub.Scalar_out + +- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow + CUDA: foreach_tensor_sub_list_kernel_cuda + +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow_ + CUDA: foreach_tensor_sub_list_kernel_cuda_ + autogen: _foreach_sub.List_out + +- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + autogen: _foreach_sub.ScalarList_out + +- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_scalar_kernel_slow + CUDA: foreach_tensor_mul_scalar_kernel_cuda + +- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_scalar_kernel_slow_ + CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + autogen: _foreach_mul.Scalar_out + +- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow + CUDA: foreach_tensor_mul_list_kernel_cuda + +- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow_ + CUDA: foreach_tensor_mul_list_kernel_cuda_ + autogen: _foreach_mul.List_out + +- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + autogen: _foreach_mul.ScalarList_out + +- func: _foreach_div.Scalar(Tensor[] 
self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow + CUDA: foreach_tensor_div_scalar_kernel_cuda + +- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow_ + CUDA: foreach_tensor_div_scalar_kernel_cuda_ + autogen: _foreach_div.Scalar_out + +- func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow + CUDA: foreach_tensor_div_list_kernel_cuda + +- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow_ + CUDA: foreach_tensor_div_list_kernel_cuda_ + autogen: _foreach_div.List_out + +- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + autogen: _foreach_div.ScalarList_out + +- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + +- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_clamp_max.Scalar_out + +- func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_list_kernel_slow + CUDA: foreach_tensor_clamp_max_list_kernel_cuda + +- func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_list_kernel_slow_ + CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + autogen: _foreach_clamp_max.List_out + +- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow + CUDA: 
foreach_tensor_clamp_max_scalarlist_kernel_cuda + +- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_max.ScalarList_out + +- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + +- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_clamp_min.Scalar_out + +- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_list_kernel_slow + CUDA: foreach_tensor_clamp_min_list_kernel_cuda + +- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_clamp_min.List_out + +- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + +- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_min.ScalarList_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + +- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_maximum.Scalar_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_list_kernel_slow + CUDA: 
foreach_tensor_clamp_min_list_kernel_cuda + +- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_maximum.List_out + +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + +- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + autogen: _foreach_maximum.ScalarList_out + +- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + +- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_minimum.Scalar_out + +- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_list_kernel_slow + CUDA: foreach_tensor_clamp_max_list_kernel_cuda + +- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_list_kernel_slow_ + CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + autogen: _foreach_minimum.List_out + +- func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + +- func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + autogen: _foreach_minimum.ScalarList_out + +- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_scalar_slow + CUDA: foreach_tensor_addcdiv_scalar_cuda + +- func: 
_foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_scalarlist_slow + CUDA: foreach_tensor_addcdiv_scalarlist_cuda + +- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_tensor_slow + CUDA: foreach_tensor_addcdiv_tensor_cuda + +- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_scalar_slow_ + CUDA: foreach_tensor_addcdiv_scalar_cuda_ + autogen: _foreach_addcdiv.Scalar_out + +- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_scalarlist_slow_ + CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ + autogen: _foreach_addcdiv.ScalarList_out + +- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_tensor_slow_ + CUDA: foreach_tensor_addcdiv_tensor_cuda_ + autogen: _foreach_addcdiv.Tensor_out + +- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalar_slow + CUDA: foreach_tensor_addcmul_scalar_cuda + +- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalarlist_slow + CUDA: foreach_tensor_addcmul_scalarlist_cuda + +- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_tensor_slow + CUDA: foreach_tensor_addcmul_tensor_cuda + +- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalar_slow_ + CUDA: foreach_tensor_addcmul_scalar_cuda_ + autogen: _foreach_addcmul.Scalar_out + +- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalarlist_slow_ + CUDA: foreach_tensor_addcmul_scalarlist_cuda_ + 
autogen: _foreach_addcmul.ScalarList_out + +- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_tensor_slow_ + CUDA: foreach_tensor_addcmul_tensor_cuda_ + autogen: _foreach_addcmul.Tensor_out + +- func: _foreach_abs(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_abs_slow + CUDA: foreach_tensor_abs_cuda + +- func: _foreach_abs_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_abs_slow_ + CUDA: foreach_tensor_abs_cuda_ + autogen: _foreach_abs.out + +- func: _foreach_acos(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_acos_slow + CUDA: foreach_tensor_acos_cuda + +- func: _foreach_acos_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_acos_slow_ + CUDA: foreach_tensor_acos_cuda_ + autogen: _foreach_acos.out + +- func: _foreach_asin(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_asin_slow + CUDA: foreach_tensor_asin_cuda + +- func: _foreach_asin_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_asin_slow_ + CUDA: foreach_tensor_asin_cuda_ + autogen: _foreach_asin.out + +- func: _foreach_atan(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_atan_slow + CUDA: foreach_tensor_atan_cuda + +- func: _foreach_atan_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_atan_slow_ + CUDA: foreach_tensor_atan_cuda_ + autogen: _foreach_atan.out + +- func: _foreach_ceil(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ceil_slow + CUDA: foreach_tensor_ceil_cuda + +- func: _foreach_ceil_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ceil_slow_ + CUDA: foreach_tensor_ceil_cuda_ + autogen: _foreach_ceil.out + +- func: _foreach_cos(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_cos_slow + CUDA: foreach_tensor_cos_cuda + +- func: _foreach_cos_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_cos_slow_ + CUDA: 
foreach_tensor_cos_cuda_ + autogen: _foreach_cos.out + +- func: _foreach_cosh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_cosh_slow + CUDA: foreach_tensor_cosh_cuda + +- func: _foreach_cosh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_cosh_slow_ + CUDA: foreach_tensor_cosh_cuda_ + autogen: _foreach_cosh.out + +- func: _foreach_erf(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_erf_slow + CUDA: foreach_tensor_erf_cuda + +- func: _foreach_erf_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_erf_slow_ + CUDA: foreach_tensor_erf_cuda_ + autogen: _foreach_erf.out + +- func: _foreach_erfc(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_erfc_slow + CUDA: foreach_tensor_erfc_cuda + +- func: _foreach_erfc_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_erfc_slow_ + CUDA: foreach_tensor_erfc_cuda_ + autogen: _foreach_erfc.out + +- func: _foreach_exp(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_exp_slow + CUDA: foreach_tensor_exp_cuda + +- func: _foreach_exp_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_exp_slow_ + CUDA: foreach_tensor_exp_cuda_ + autogen: _foreach_exp.out + +- func: _foreach_expm1(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_expm1_slow + CUDA: foreach_tensor_expm1_cuda + +- func: _foreach_expm1_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_expm1_slow_ + CUDA: foreach_tensor_expm1_cuda_ + autogen: _foreach_expm1.out + +- func: _foreach_floor(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_floor_slow + CUDA: foreach_tensor_floor_cuda + +- func: _foreach_floor_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_floor_slow_ + CUDA: foreach_tensor_floor_cuda_ + autogen: _foreach_floor.out + +- func: _foreach_frac(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_frac_slow + CUDA: foreach_tensor_frac_cuda + +- func: _foreach_frac_(Tensor(a!)[] self) 
-> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_frac_slow_ + CUDA: foreach_tensor_frac_cuda_ + autogen: _foreach_frac.out + +- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ternary_lerp_slow + CUDA: foreach_tensor_lerp_ternary_cuda + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ternary_lerp_slow_ + CUDA: foreach_tensor_lerp_ternary_cuda_ + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lerp_list_kernel_slow + CUDA: foreach_tensor_lerp_list_cuda + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lerp_list_kernel_slow_ + CUDA: foreach_tensor_lerp_list_cuda_ + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lgamma(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lgamma_slow + CUDA: foreach_tensor_lgamma_cuda + +- func: _foreach_lgamma_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lgamma_slow_ + CUDA: foreach_tensor_lgamma_cuda_ + autogen: _foreach_lgamma.out + +- func: _foreach_log(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log_slow + CUDA: foreach_tensor_log_cuda + +- func: _foreach_log_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log_slow_ + CUDA: foreach_tensor_log_cuda_ + autogen: _foreach_log.out + +- func: _foreach_log10(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log10_slow + CUDA: foreach_tensor_log10_cuda + +- func: _foreach_log10_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log10_slow_ + CUDA: foreach_tensor_log10_cuda_ + autogen: _foreach_log10.out + +- func: _foreach_log1p(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log1p_slow + CUDA: foreach_tensor_log1p_cuda + +- func: _foreach_log1p_(Tensor(a!)[] self) -> () + 
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log1p_slow_ + CUDA: foreach_tensor_log1p_cuda_ + autogen: _foreach_log1p.out + +- func: _foreach_log2(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log2_slow + CUDA: foreach_tensor_log2_cuda + +- func: _foreach_log2_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_log2_slow_ + CUDA: foreach_tensor_log2_cuda_ + autogen: _foreach_log2.out + +- func: _foreach_neg(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_neg_slow + CUDA: foreach_tensor_neg_cuda + +- func: _foreach_neg_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_neg_slow_ + CUDA: foreach_tensor_neg_cuda_ + autogen: _foreach_neg.out + +- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_norm_slow + CUDA: foreach_tensor_norm_cuda + autogen: _foreach_norm.Scalar_out + +- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_pow_list_kernel_slow + CUDA: foreach_tensor_pow_list_kernel_cuda + +- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_pow_scalar_kernel_slow + CUDA: foreach_tensor_pow_scalar_kernel_cuda + +- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_pow_scalarlist_kernel_slow + CUDA: foreach_tensor_pow_scalarlist_kernel_cuda + +- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_scalar_pow_list_kernel_slow + CUDA: foreach_scalar_pow_list_kernel_cuda + +- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: foreach_tensor_pow_list_kernel_slow_ + CUDA: foreach_tensor_pow_list_kernel_cuda_ + autogen: _foreach_pow.List_out + +- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: foreach_tensor_pow_scalar_kernel_slow_ + CUDA: foreach_tensor_pow_scalar_kernel_cuda_ + autogen: _foreach_pow.Scalar_out + +- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: foreach_tensor_pow_scalarlist_kernel_slow_ + CUDA: 
foreach_tensor_pow_scalarlist_kernel_cuda_ + autogen: _foreach_pow.ScalarList_out + +- func: _foreach_reciprocal(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow + CUDA: foreach_tensor_reciprocal_cuda + +- func: _foreach_reciprocal_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow_ + CUDA: foreach_tensor_reciprocal_cuda_ + autogen: _foreach_reciprocal.out + +- func: _foreach_round(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_round_slow + CUDA: foreach_tensor_round_cuda + +- func: _foreach_round_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_round_slow_ + CUDA: foreach_tensor_round_cuda_ + autogen: _foreach_round.out + +- func: _foreach_sigmoid(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow + CUDA: foreach_tensor_sigmoid_cuda + +- func: _foreach_sigmoid_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow_ + CUDA: foreach_tensor_sigmoid_cuda_ + autogen: _foreach_sigmoid.out + +- func: _foreach_sin(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sin_slow + CUDA: foreach_tensor_sin_cuda + +- func: _foreach_sin_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sin_slow_ + CUDA: foreach_tensor_sin_cuda_ + autogen: _foreach_sin.out + +- func: _foreach_sinh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sinh_slow + CUDA: foreach_tensor_sinh_cuda + +- func: _foreach_sinh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sinh_slow_ + CUDA: foreach_tensor_sinh_cuda_ + autogen: _foreach_sinh.out + +- func: _foreach_sqrt(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sqrt_slow + CUDA: foreach_tensor_sqrt_cuda + +- func: _foreach_sqrt_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_sqrt_slow_ + CUDA: foreach_tensor_sqrt_cuda_ + autogen: _foreach_sqrt.out + +- func: _foreach_tan(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: 
foreach_tensor_tan_slow + CUDA: foreach_tensor_tan_cuda + +- func: _foreach_tan_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_tan_slow_ + CUDA: foreach_tensor_tan_cuda_ + autogen: _foreach_tan.out + +- func: _foreach_tanh(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_tanh_slow + CUDA: foreach_tensor_tanh_cuda + +- func: _foreach_tanh_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_tanh_slow_ + CUDA: foreach_tensor_tanh_cuda_ + autogen: _foreach_tanh.out + +- func: _foreach_trunc(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow + CUDA: foreach_tensor_trunc_cuda + +- func: _foreach_trunc_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow_ + CUDA: foreach_tensor_trunc_cuda_ + autogen: _foreach_trunc.out + +- func: _foreach_zero_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_zero_slow_ + CUDA: foreach_tensor_zero_cuda_ + autogen: _foreach_zero, _foreach_zero.out + +- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor + dispatch: + CPU: bucketize_cpu + CUDA: bucketize_cuda + +- func: bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: bucketize_out_cpu + CUDA: bucketize_out_cuda + +- func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor + dispatch: + CPU: bucketize_cpu + CUDA: bucketize_cuda + autogen: bucketize.Scalar_out + +- func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor + dispatch: + CPU: searchsorted_cpu + CUDA: searchsorted_cuda + +- func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: searchsorted_out_cpu + CUDA: searchsorted_out_cuda + +- func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor + dispatch: + CPU: searchsorted_cpu + CUDA: searchsorted_cuda + autogen: searchsorted.Scalar_out + +- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor + structured_delegate: _convert_indices_from_coo_to_csr.out + +- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + dispatch: + CPU: _convert_indices_from_coo_to_csr_structured_cpu + CUDA: _convert_indices_from_coo_to_csr_structured_cuda + +- func: _convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor + structured_delegate: _convert_indices_from_csr_to_coo.out + +- func: _convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: _convert_indices_from_csr_to_coo_structured_cpu + CUDA: _convert_indices_from_csr_to_coo_structured_cuda + +## NN wrappers + +- func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: mse_loss_out + MPS: mse_loss_out_mps + +- func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out + python_module: nn + +- func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: mse_loss_backward_out + MPS: mse_loss_backward_out_mps + +- func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: mse_loss_backward + MPS: mse_loss_backward_mps + +- func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + +- func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_out + CUDA: multi_margin_loss_cuda_out + +- func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu + CUDA: multi_margin_loss_cuda + +- func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_backward_out + CUDA: multi_margin_loss_cuda_backward_out + +- func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CPU: multi_margin_loss_cpu_backward + CUDA: multi_margin_loss_cuda_backward + +- func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + +- func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) 
is_target) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_forward_out_cpu + CUDA: multilabel_margin_loss_forward_out_cuda + +- func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_forward_cpu + CUDA: multilabel_margin_loss_forward_cuda + +- func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: multilabel_margin_loss_backward_cpu_out + CUDA: multilabel_margin_loss_backward_cuda_out + +- func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor + python_module: nn + dispatch: + CPU: multilabel_margin_loss_backward_cpu + CUDA: multilabel_margin_loss_backward_cuda + +- func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss_nd_symint + +- func: nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss_symint + +- func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: nll_loss_forward_out_cpu + CUDA: nll_loss_forward_out_cuda + MPS: nll_loss_forward_out_mps + +- func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) + python_module: nn + structured_delegate: nll_loss_forward.output + +- func: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: nll_loss_backward_out_cpu + CUDA: nll_loss_backward_out_cuda + MPS: nll_loss_backward_out_mps + +- func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor + python_module: nn + structured_delegate: nll_loss_backward.grad_input + +- func: nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: nll_loss2d_symint + +- func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: nll_loss2d_forward_out_cpu + CUDA: nll_loss2d_forward_out_cuda + MPS: nll_loss2d_forward_out_mps + +- func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? 
weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) + python_module: nn + dispatch: + CPU: nll_loss2d_forward_cpu + CUDA: nll_loss2d_forward_cuda + MPS: nll_loss2d_forward_mps + +- func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: nll_loss2d_backward_out_cpu + CUDA: nll_loss2d_backward_out_cuda + MPS: nll_loss2d_backward_out_mps + +- func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor + python_module: nn + dispatch: + CPU: nll_loss2d_backward_cpu + CUDA: nll_loss2d_backward_cuda + MPS: nll_loss2d_backward_mps + +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss_out + MPS: smooth_l1_loss_out_mps + +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: smooth_l1_loss.out + python_module: nn + +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: smooth_l1_loss_backward_out + CUDA: smooth_l1_loss_backward_out + MPS: smooth_l1_loss_backward_out_mps + +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: smooth_l1_loss_backward + +- func: huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: huber_loss_out + MPS: huber_loss_out_mps + +- func: huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: huber_loss + MPS: huber_loss_mps + +- func: huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: huber_loss_backward_out + MPS: huber_loss_backward_out_mps + +- func: huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: huber_loss_backward + +- func: soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_out + +- func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss + +- func: soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_backward_out + +- func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: soft_margin_loss_backward + +- func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: elu_out + MPS: elu_out_mps + +- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor + structured_delegate: elu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: elu_backward_out + MPS: elu_backward_out_mps + +- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor + structured_delegate: elu_backward.grad_input + python_module: nn + +- func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) + structured_delegate: elu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: glu_out + MPS: glu_out_mps + +- func: glu(Tensor self, int dim=-1) -> Tensor + structured_delegate: glu.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: glu_backward_cpu_out + CUDA: glu_backward_cuda_out + MPS: glu_backward_mps_out + +- func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor + python_module: nn + dispatch: + CPU: glu_backward_cpu + CUDA: glu_backward_cuda + MPS: glu_backward_mps + +- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_jvp + autogen: glu_jvp.out + +- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_backward_jvp + autogen: glu_backward_jvp.out + +- func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out + MPS: hardsigmoid_out_mps + QuantizedCPU: hardsigmoid_out_quantized_cpu + +- func: hardsigmoid(Tensor self) -> Tensor + structured_delegate: hardsigmoid.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: hardsigmoid_quantized_cpu + +- func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) + structured_delegate: hardsigmoid.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_backward_out + MPS: hardsigmoid_backward_out_mps + +- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: hardsigmoid_backward.grad_input + python_module: nn + +- func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh_out + QuantizedCPU: hardtanh_out_quantized_cpu + +- func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh + QuantizedCPU: hardtanh_quantized_cpu + tags: core + +- func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward_out + MPS: hardtanh_backward_out_mps + +- func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward + MPS: hardtanh_backward_mps + +- func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA, MPS: hardtanh_ + QuantizedCPU: hardtanh_quantized_cpu_ + +- func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish_out + MPS: hardswish_out_mps + +- func: hardswish(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish + MPS: hardswish_mps + +- func: hardswish_(Tensor(a!) self) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: hardswish_ + MPS: hardswish_mps_ + +- func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: hardswish_backward + MPS: hardswish_backward_mps + autogen: hardswish_backward.out + +- func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: leaky_relu_out + MPS: leaky_relu_out_mps + QuantizedCPU: leaky_relu_out_quantized_cpu + +- func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor + structured_delegate: leaky_relu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: leaky_relu_quantized_cpu + tags: core + +- func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward_out + MPS: leaky_relu_backward_out_mps + +- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor + structured_delegate: leaky_relu_backward.grad_input + python_module: nn + +- func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) 
+ structured_delegate: leaky_relu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + QuantizedCPU: leaky_relu_quantized_cpu_ + +- func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: nn + +- func: log_sigmoid(Tensor self) -> Tensor + device_check: NoCheck # TensorIterator + python_module: nn + +- func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: log_sigmoid_forward_out_cpu + CUDA: log_sigmoid_forward_out_cuda + MPS: log_sigmoid_forward_out_mps + +- func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: log_sigmoid_forward_cpu + CUDA: log_sigmoid_forward_cuda + MPS: log_sigmoid_forward_mps + +- func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: log_sigmoid_backward_cpu_out + CUDA: log_sigmoid_backward_cuda_out + MPS: log_sigmoid_backward_mps_out + +- func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor + python_module: nn + dispatch: + CPU: log_sigmoid_backward_cpu + CUDA: log_sigmoid_backward_cuda + MPS: log_sigmoid_backward_mps + +- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + tags: nondeterministic_seeded + dispatch: + CPU: rrelu_with_noise_out_cpu + CUDA: rrelu_with_noise_out_cuda + +- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor + python_module: nn + dispatch: + CPU: rrelu_with_noise_cpu + CUDA: rrelu_with_noise_cuda + tags: nondeterministic_seeded + +- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: rrelu_with_noise_backward + autogen: rrelu_with_noise_backward.out + +- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) + python_module: nn + tags: nondeterministic_seeded + dispatch: + CPU: rrelu_with_noise_cpu_ + CUDA: rrelu_with_noise_cuda_ + +- func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: softplus_out + MPS: softplus_out_mps + +- func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor + structured_delegate: softplus.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: softplus_backward_out + MPS: softplus_backward_out_mps + +- func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor + structured_delegate: softplus_backward.grad_input + python_module: nn + +- func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU, CUDA: softshrink_out + +- func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor + structured_delegate: softshrink.out + device_check: NoCheck # TensorIterator + python_module: nn + +- func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: softshrink_backward_out + +- func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor + structured_delegate: softshrink_backward.grad_input + python_module: nn + +- func: adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: adaptive_avg_pool2d_out_cpu + CUDA: adaptive_avg_pool2d_out_cuda + MPS: adaptive_avg_pool2d_out_mps + MkldnnCPU: mkldnn_adaptive_avg_pool2d_out_stub + +- func: adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: adaptive_avg_pool2d_symint + +- func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor + dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d + +- func: mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d_out + +- func: mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor + dispatch: + MkldnnCPU: mkldnn_adaptive_avg_pool2d_backward + autogen: mkldnn_adaptive_avg_pool2d_backward.out + +- func: _adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor + dispatch: + CPU: adaptive_avg_pool2d_cpu + CUDA: adaptive_avg_pool2d_cuda + MPS: adaptive_avg_pool2d_mps + QuantizedCPU: adaptive_avg_pool2d_quantized_cpu + QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda + autogen: _adaptive_avg_pool2d.out + tags: core + +- func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU: adaptive_avg_pool2d_backward_cpu + CUDA: adaptive_avg_pool2d_backward_cuda + MPS: adaptive_avg_pool2d_backward_mps + autogen: _adaptive_avg_pool2d_backward.out + tags: core + +- func: adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_out_cpu + CUDA: adaptive_avg_pool3d_out_cuda + QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu + +- func: adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: adaptive_avg_pool3d_symint + +- func: _adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor + dispatch: + CPU: adaptive_avg_pool3d_cpu + CUDA: adaptive_avg_pool3d_cuda + QuantizedCPU: adaptive_avg_pool3d_quantized_cpu + autogen: _adaptive_avg_pool3d.out + +- func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_out_cpu + CUDA: adaptive_avg_pool3d_backward_out_cuda + +- func: _adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor + python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_cpu + CUDA: adaptive_avg_pool3d_backward_cuda + autogen: _adaptive_avg_pool3d_backward.out + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool2d_out_cpu + CUDA: adaptive_max_pool2d_out_cuda + MPS: adaptive_max_pool2d_out_mps + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) + python_module: nn + structured_delegate: adaptive_max_pool2d.out + +- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool2d_backward_out_cpu + CUDA: adaptive_max_pool2d_backward_out_cuda + MPS: adaptive_max_pool2d_backward_out_mps + +- func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor + python_module: nn + structured_delegate: adaptive_max_pool2d_backward.grad_input + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool3d_out_cpu + CUDA: adaptive_max_pool3d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) + python_module: nn + structured_delegate: adaptive_max_pool3d.out + +- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: adaptive_max_pool3d_backward_out_cpu + CUDA: adaptive_max_pool3d_backward_out_cuda + +- func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor + python_module: nn + structured_delegate: adaptive_max_pool3d_backward.grad_input + +- func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + precomputed: + - kernel_size -> int kH, int kW + - stride -> int dH, int dW + - padding -> int padH, int padW + dispatch: + CPU: avg_pool2d_out_cpu + CUDA: avg_pool2d_out_cuda + MPS: avg_pool2d_out_mps + MkldnnCPU: mkldnn_avg_pool2d_out + +- func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + python_module: nn + structured_delegate: avg_pool2d.out + dispatch: + MkldnnCPU: mkldnn_avg_pool2d + QuantizedCPU: avg_pool2d_quantized_cpu + tags: core + +- func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: avg_pool2d_backward_out_cpu + CUDA: avg_pool2d_backward_out_cuda + MPS: avg_pool2d_backward_out_mps + MkldnnCPU: mkldnn_avg_pool2d_backward_out + +- func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor + python_module: nn + structured_delegate: avg_pool2d_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_avg_pool2d_backward + tags: core + +- func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: avg_pool3d_out_cpu + CUDA: avg_pool3d_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_out + +- func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + python_module: nn + structured_delegate: avg_pool3d.out + dispatch: + MkldnnCPU: mkldnn_avg_pool3d + QuantizedCPU: avg_pool3d_quantized_cpu + +- func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: avg_pool3d_backward_out_cpu + CUDA: avg_pool3d_backward_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_backward_out + +- func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor + python_module: nn + structured_delegate: avg_pool3d_backward.grad_input + dispatch: + MkldnnCPU: mkldnn_avg_pool3d_backward + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: fractional_max_pool2d_out_cpu + CUDA: fractional_max_pool2d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) + python_module: nn + structured_delegate: fractional_max_pool2d.output + +- func: fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: fractional_max_pool2d_backward_cpu + CUDA: fractional_max_pool2d_backward_cuda + +- func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor + python_module: nn + structured_delegate: fractional_max_pool2d_backward.grad_input + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + precomputed: + - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW + - output_size -> int outputT, int outputH, int outputW + - int numBatch, int numPlanes, int inputT, int inputH, int inputW + dispatch: + CPU: fractional_max_pool3d_out_cpu + CUDA: fractional_max_pool3d_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) + python_module: nn + structured_delegate: fractional_max_pool3d.output + +- func: fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: fractional_max_pool3d_backward_out_cpu + CUDA: fractional_max_pool3d_backward_out_cuda + +- func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor + python_module: nn + dispatch: + CPU: fractional_max_pool3d_backward_cpu + CUDA: fractional_max_pool3d_backward_cuda + +# Return: (Tensor output, Tensor indices) +- func: max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + structured: True + dispatch: + CPU: max_pool2d_with_indices_out_cpu + CUDA: max_pool2d_with_indices_out_cuda + MPS: max_pool2d_with_indices_out_mps + +# Return: (Tensor output, Tensor indices) +- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + python_module: nn + structured_delegate: max_pool2d_with_indices.out + tags: core + +- func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: max_pool2d_with_indices_backward_out_cpu + CUDA: max_pool2d_with_indices_backward_out_cuda + MPS: max_pool2d_with_indices_backward_out_mps + +- func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor + python_module: nn + structured_delegate: max_pool2d_with_indices_backward.grad_input + tags: core + +# Return: (Tensor output, Tensor indices) +- func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_out_cpu + CUDA: max_pool3d_with_indices_out_cuda + +# Return: (Tensor output, Tensor indices) +- func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_cpu + CUDA: max_pool3d_with_indices_cuda + tags: core + +- func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU: max_pool3d_with_indices_backward_out_cpu + CUDA: max_pool3d_with_indices_backward_out_cuda + +- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor + python_module: nn + dispatch: + CPU: max_pool3d_with_indices_backward_cpu + CUDA: max_pool3d_with_indices_backward_cuda + +- func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: max_unpooling2d_forward_out_cpu + CUDA: max_unpooling2d_forward_out_cuda + +- func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor + python_module: nn + dispatch: + CPU: max_unpooling2d_forward_cpu + CUDA: max_unpooling2d_forward_cuda + +- func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: max_unpooling3d_forward_out_cpu + CUDA: max_unpooling3d_forward_out_cuda + +- func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor + python_module: nn + dispatch: + CPU: max_unpooling3d_forward_cpu + CUDA: max_unpooling3d_forward_cuda + +- func: reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad1d_out_cpu + QuantizedCPU: reflection_pad1d_out_quantized_cpu + CUDA: reflection_pad1d_out_cuda + MPS: reflection_pad1d_out_mps + +- func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad1d.out + +- func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad1d_backward_out_cpu + CUDA: reflection_pad1d_backward_out_cuda + MPS: reflection_pad1d_backward_out_mps + +- func: reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad1d_backward.grad_input + +- func: reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU, QuantizedCPU: reflection_pad2d_out_cpu + CUDA: reflection_pad2d_out_cuda + MPS: reflection_pad2d_out_mps + +- func: reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu + CUDA: reflection_pad2d_cuda + MPS: reflection_pad2d_mps + tags: core + +- func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: reflection_pad2d_backward_out_cpu + CUDA: reflection_pad2d_backward_out_cuda + MPS: reflection_pad2d_backward_out_mps + +- func: reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: reflection_pad2d_backward_cpu + CUDA: reflection_pad2d_backward_cuda + MPS: reflection_pad2d_backward_mps + +- func: reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_out_cpu + CUDA: reflection_pad3d_out_cuda + MPS: reflection_pad3d_out_mps + +- func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d.out + +- func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_backward_out_cpu + CUDA: reflection_pad3d_backward_out_cuda + MPS: reflection_pad3d_backward_out_mps + +- func: reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d_backward.grad_input + +- func: replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad1d_out_cpu + CUDA: replication_pad1d_out_cuda + MPS: replication_pad1d_out_mps + +- func: replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad1d.out + +- func: replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad1d_backward_out_cpu + CUDA: replication_pad1d_backward_out_cuda + MPS: replication_pad1d_backward_out_mps + +- func: replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad1d_backward.grad_input + +- func: replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad2d_out_cpu + CUDA: replication_pad2d_out_cuda + MPS: replication_pad2d_out_mps + +- func: replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad2d.out + tags: core + +- func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: replication_pad2d_backward_out_cpu + CUDA: replication_pad2d_backward_out_cuda + MPS: replication_pad2d_backward_out_mps + +- func: replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor + python_module: nn + dispatch: + CPU: replication_pad2d_backward_cpu + CUDA: replication_pad2d_backward_cuda + MPS: replication_pad2d_backward_mps + +- func: replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: replication_pad3d_out_cpu + CUDA: replication_pad3d_out_cuda + MPS: replication_pad3d_out_mps + +- func: replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + structured_delegate: replication_pad3d.out + tags: core + + +- func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU: replication_pad3d_backward_out_cpu + CUDA: replication_pad3d_backward_out_cuda + MPS: replication_pad3d_backward_out_mps + +- func: replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor + python_module: nn + dispatch: + CPU: replication_pad3d_backward_cpu + CUDA: replication_pad3d_backward_cuda + MPS: replication_pad3d_backward_mps + +- func: _pad_circular(Tensor self, SymInt[] pad) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: _pad_circular_symint + +- func: _pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: _pad_enum_symint + +- func: pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor + python_module: nn + dispatch: + CompositeImplicitAutograd: pad_symint + +- func: upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_linear1d.vec_out + +- func: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_bilinear2d.vec_out + tags: core + +- func: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_bilinear2d_aa.vec_out + +- func: upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_trilinear3d.vec_out + +- func: upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_bicubic2d.vec_out + +- func: _upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_bicubic2d_aa.vec_out + +- func: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest1d.vec_out + +- func: _upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact1d.vec_out + +- func: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest2d.vec_out + tags: core + +- func: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact2d.vec_out + +- func: upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: upsample_nearest3d.vec_out + +- func: _upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor + python_module: nn + autogen: _upsample_nearest_exact3d.vec_out + +# NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility. +- func: upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_linear1d_out_cpu + CUDA: upsample_linear1d_out_cuda + +- func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? 
scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_linear1d.out + +- func: upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_linear1d_backward_out_cpu + CUDA: upsample_linear1d_backward_out_cuda + +- func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_linear1d_backward.grad_input + +- func: upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bilinear2d_out_cpu + CUDA: upsample_bilinear2d_out_cuda + MPS: upsample_bilinear2d_out_mps + +- func: upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bilinear2d.out + dispatch: + QuantizedCPU: upsample_bilinear2d_quantized_cpu + +- func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bilinear2d_backward_out_cpu + CUDA: upsample_bilinear2d_backward_out_cuda + MPS: upsample_bilinear2d_backward_out_mps + +- func: upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bilinear2d_backward.grad_input + +- func: _upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bilinear2d_aa_out_cpu + CUDA: _upsample_bilinear2d_aa_out_cuda + +- func: _upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bilinear2d_aa.out + +- func: _upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bilinear2d_aa_backward_out_cpu + CUDA: _upsample_bilinear2d_aa_backward_out_cuda + +- func: _upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bilinear2d_aa_backward.grad_input + +- func: upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bicubic2d_out_cpu + CUDA: upsample_bicubic2d_out_cuda + +- func: upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bicubic2d.out + +- func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_bicubic2d_backward_out_cpu + CUDA: upsample_bicubic2d_backward_out_cuda + +- func: upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_bicubic2d_backward.grad_input + +- func: _upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bicubic2d_aa_out_cpu + CUDA: _upsample_bicubic2d_aa_out_cuda + +- func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bicubic2d_aa.out + +- func: _upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_bicubic2d_aa_backward_out_cpu + CUDA: _upsample_bicubic2d_aa_backward_out_cuda + +- func: _upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_bicubic2d_aa_backward.grad_input + +- func: upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_trilinear3d_out_cpu + CUDA: upsample_trilinear3d_out_cuda + +- func: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_trilinear3d.out + +- func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_trilinear3d_backward_out_cpu + CUDA: upsample_trilinear3d_backward_out_cuda + +- func: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_trilinear3d_backward.grad_input + +- func: upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest1d_out_cpu + CUDA: upsample_nearest1d_out_cuda + MPS: upsample_nearest1d_out_mps + +- func: _upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact1d_out_cpu + CUDA: _upsample_nearest_exact1d_out_cuda + MPS: _upsample_nearest_exact1d_out_mps + +- func: upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest1d.out + +- func: _upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact1d.out + +- func: upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest1d_backward_out_cpu + CUDA: upsample_nearest1d_backward_out_cuda + MPS: upsample_nearest1d_backward_out_mps + +- func: _upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact1d_backward_out_cpu + CUDA: _upsample_nearest_exact1d_backward_out_cuda + MPS: _upsample_nearest_exact1d_backward_out_mps + +- func: upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest1d_backward.grad_input + +- func: _upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact1d_backward.grad_input + +- func: upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest2d_out_cpu + CUDA: upsample_nearest2d_out_cuda + MPS: upsample_nearest2d_out_mps + +- func: _upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact2d_out_cpu + CUDA: _upsample_nearest_exact2d_out_cuda + MPS: _upsample_nearest_exact2d_out_mps + +- func: upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest2d.out + dispatch: + QuantizedCPU: upsample_nearest2d_quantized_cpu + +- func: _upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact2d.out + dispatch: + QuantizedCPU: _upsample_nearest_exact2d_quantized_cpu + +- func: upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest2d_backward_out_cpu + CUDA: upsample_nearest2d_backward_out_cuda + MPS: upsample_nearest2d_backward_out_mps + +- func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact2d_backward_out_cpu + CUDA: _upsample_nearest_exact2d_backward_out_cuda + MPS: _upsample_nearest_exact2d_backward_out_mps + +- func: upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest2d_backward.grad_input + +- func: _upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact2d_backward.grad_input + +- func: upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest3d_out_cpu + CUDA: upsample_nearest3d_out_cuda + +- func: _upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact3d_out_cpu + CUDA: _upsample_nearest_exact3d_out_cuda + +- func: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest3d.out + dispatch: + QuantizedCPU: upsample_nearest3d_quantized_cpu + +- func: _upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact3d.out + dispatch: + QuantizedCPU: _upsample_nearest_exact3d_quantized_cpu + +- func: upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: upsample_nearest3d_backward_out_cpu + CUDA: upsample_nearest3d_backward_out_cuda + +- func: _upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: _upsample_nearest_exact3d_backward_out_cpu + CUDA: _upsample_nearest_exact3d_backward_out_cuda + +- func: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: upsample_nearest3d_backward.grad_input + +- func: _upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + structured_delegate: _upsample_nearest_exact3d_backward.grad_input + +- func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: sigmoid_backward_out + MPS: sigmoid_backward_out_mps + tags: pointwise + +- func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor + python_module: nn + structured_delegate: sigmoid_backward.grad_input + tags: pointwise + +- func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: logit_backward_out + MPS: logit_backward_out_mps + tags: pointwise + +- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor + python_module: nn + structured_delegate: logit_backward.grad_input + tags: pointwise + +- func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: tanh_backward_out + MPS: tanh_backward_out_mps + tags: pointwise + +- func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor + python_module: nn + structured_delegate: tanh_backward.grad_input + +# What's a thnn_conv_ versus a slow_conv_? +# +# Historically, we have inefficient implementations of convolutions +# coming from the THNN/THCUNN library. These convolutions typically +# operated by computing the Toeplitz matrix and then doing a matrix +# multiply with the input; this is very memory inefficient! However, +# occasionally, we really don't have anything better, so it's helpful +# to have these fallbacks when there is no more optimized implementation +# in cudnn or mkldnn, etc. Both thnn_ and slow_ convolutions fall +# into this bucket. +# +# The difference between these two designations, is that thnn_ refers +# to a convolution that is still written in the "legacy" style; that is, +# C code in the THNN/ or THCUNN/ directory. A slow_ convolution is +# one that is written in the native style: modern C++. Algorithmically, +# these are the same thing, but we give them different prefixes to +# make the operational distinction clear. + tags: pointwise + +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: slow_conv_transpose2d_structured_cpu + CUDA: slow_conv_transpose2d_structured_cuda + +- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor + python_module: nn + structured_delegate: slow_conv_transpose2d.out + +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv_transpose3d_out_cpu + CUDA: slow_conv_transpose3d_out_cuda + +- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_transpose3d_cpu + CUDA: slow_conv_transpose3d_cuda + +- func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor + python_module: nn + +- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv2d_forward_out_cpu + CUDA: slow_conv2d_forward_out_cuda + +- func: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv2d_forward_cpu + CUDA: slow_conv2d_forward_cuda + +- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + python_module: nn + dispatch: + CPU: slow_conv2d_backward_out_cpu + CUDA: slow_conv2d_backward_out_cuda + +- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + python_module: nn + dispatch: + CPU: slow_conv2d_backward_cpu + CUDA: slow_conv2d_backward_cuda + autogen: _slow_conv2d_backward.output_mask_out + +- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: nn + dispatch: + CUDA: conv_depthwise2d_cuda_out + +- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor + python_module: nn + dispatch: + CUDA: conv_depthwise2d_cuda + +- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor + python_module: nn + dispatch: + CUDA: conv_depthwise3d_cuda + autogen: conv_depthwise3d.out + +- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + +- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor + python_module: nn + +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) + python_module: nn + dispatch: + CPU: slow_conv3d_forward_out_cpu + +- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv3d_forward_cpu + +- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_dilated2d_cpu + CUDA: slow_conv_dilated2d_cuda + autogen: slow_conv_dilated2d.out + +- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor + python_module: nn + dispatch: + CPU: slow_conv_dilated3d_cpu + CUDA: slow_conv_dilated3d_cuda + autogen: slow_conv_dilated3d.out + +- func: col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: col2im_out_cpu + CUDA: col2im_out_cuda + +- func: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor + python_module: nn + dispatch: + CPU: col2im_cpu + CUDA: col2im_cuda + tags: core + +- func: column_stack(Tensor[] tensors) -> Tensor + +- func: column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU: im2col_out_cpu + CUDA: im2col_out_cuda + +- func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor + python_module: nn + dispatch: + CPU: im2col_cpu + CUDA: im2col_cuda + +- func: isfinite(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + +- func: isinf(Tensor self) -> Tensor + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: isinf + SparseCPU, SparseCUDA: isinf_sparse + SparseMeta: isinf_sparse_meta + SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr + autogen: isinf.out + tags: core + +- func: record_stream(Tensor(a!) self, Stream s) -> () + variants: method + dispatch: + CUDA: record_stream_cuda + +- func: isposinf(Tensor self) -> Tensor + variants: function, method + structured_delegate: isposinf.out + dispatch: + SparseCPU, SparseCUDA: isposinf_sparse + SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr + tags: pointwise + +- func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: isposinf_out + SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out + tags: pointwise + +- func: isneginf(Tensor self) -> Tensor + variants: function, method + structured_delegate: isneginf.out + dispatch: + SparseCPU, SparseCUDA: isneginf_sparse + SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr + tags: pointwise + +- func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: isneginf_out + SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out + tags: pointwise + +# NOTE [_add_batch_dim and _remove_batch_dim] +# _add_batch_dim and _remove_batch_dim are meant to be used in the implementation +# of the vmap frontend API (see torch/_vmap_internals.py). They are not +# user-facing, hence the leading underscore. Please don't use them them anywhere else. 
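+# For orientation only: the user-facing counterpart of these two internal ops is the
+# public vmap frontend. A minimal illustrative sketch (assuming a PyTorch build that
+# provides torch.func.vmap; the code below is example usage, not part of this schema):
+#
+#   import torch
+#   from torch.func import vmap
+#
+#   def dot(x, y):
+#       return (x * y).sum()
+#
+#   xs, ys = torch.randn(8, 3), torch.randn(8, 3)
+#   out = vmap(dot)(xs, ys)   # maps dot() over dim 0; out.shape == torch.Size([8])
+#
+# _add_batch_dim and _remove_batch_dim below are the internal primitives such a frontend
+# builds on; per the note above, call the public vmap API rather than these ops directly.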
+- func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor + variants: function + +# See NOTE [_add_batch_dim and _remove_batch_dim] +- func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor + variants: function + +## Functions related to the `torch.special` namespace +# Note [special namespace binding] +# Functions in the special python module should have their names start with +# "special_" underscore and be bound to the desired Python name in +# torch/special/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/special.h. +# The "special_" names should be hidden from the user and not documented. + +- func: special_entr(Tensor self) -> Tensor + structured_delegate: special_entr.out + python_module: special + variants: function + tags: pointwise + +- func: special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_entr_out + tags: pointwise + +- func: special_ndtri(Tensor self) -> Tensor + structured_delegate: special_ndtri.out + python_module: special + variants: function + tags: pointwise + +- func: special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_ndtri_out + tags: pointwise + +- func: special_log_ndtr(Tensor self) -> Tensor + structured_delegate: special_log_ndtr.out + python_module: special + variants: function + tags: pointwise + +- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_log_ndtr_out + tags: pointwise + +- func: special_expm1(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_exp2(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_psi(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_digamma(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammaln(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_erf(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_erfc(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_erfcx(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_erfcx.out + tags: pointwise + +- func: special_erfcx.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_erfcx_out + tags: pointwise + +- func: special_erfinv(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_ndtr(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_xlog1py(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + structured_delegate: special_xlog1py.out + tags: pointwise + +- func: special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py + tags: pointwise + +- func: special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py + tags: pointwise + +- func: special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_xlog1py_out + tags: pointwise + +- func: special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py_out + tags: pointwise + +- func: special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_xlog1py_out + tags: pointwise + +- func: special_xlogy(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_zeta(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + structured_delegate: special_zeta.out + tags: pointwise + +- func: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + tags: pointwise + +- func: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + tags: pointwise + +- func: special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_zeta_out + tags: pointwise + +- func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + tags: pointwise + +- func: special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + tags: pointwise + +- func: special_i0(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_i0e(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i0e.out + tags: pointwise + +- func: special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i0e_out + tags: pointwise + +- func: special_i1(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1.out + tags: pointwise + +- func: special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1_out + tags: pointwise + +- func: special_i1e(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1e.out + tags: pointwise + +- func: special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1e_out + tags: pointwise + +- func: special_logit(Tensor self, float? eps=None) -> Tensor + python_module: special + variants: function + +- func: special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_polygamma(int n, Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + python_module: special + variants: function + +- func: special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!) + python_module: special + +- func: special_expit(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_sinc(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_round(Tensor self, *, int decimals=0) -> Tensor + python_module: special + variants: function + +- func: special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_log1p(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + +- func: special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammainc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammaincc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln(Tensor self, int p) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + +## Functions related to the fast Fourier transform and the torch.fft namespace +# Note [FFT namespace binding] +# Functions in the fft python module should have their names start with +# "fft_" underscore and be bound to the desired Python name in +# torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h. +# The "fft_" names should be hidden from the user and not documented. +# +# See fft_fft as an example. + +# torch.fft.fft +# NOTE: NOT an alias for torch.fft, which has different semantics +- func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint + +- func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint_out + +- func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint + +- func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint_out + +- func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint + +- func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? 
norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint_out + +- func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint + +- func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint_out + +- func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint + +- func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint_out + +- func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint + +- func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint_out + +- func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint + +- func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint_out + +- func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint + +- func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint_out + +- func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint + +- func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint_out + +- func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint + +- func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint_out + +- func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint + +- func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) 
+ use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint_out + +- func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint + +- func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint_out + +- func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint + +- func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint_out + +- func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint + +- func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint_out + +- func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint + +- func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint_out + +- func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint + +- func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint_out + +- func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint + +- func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint_out + +- func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint + +- func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + python_module: fft + variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint_out + +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_fftfreq + +- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_fftfreq_out + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_rfftfreq + +- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + python_module: fft + variants: function + dispatch: + CompositeExplicitAutograd: fft_rfftfreq_out + +- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + variants: function + +## Functions for linear algebra and the torch.linalg namespace +# Note [linalg namespace binding] +# Functions in the linalg python module should have their names start with +# "linalg_" and be bound to the desired Python name in +# torch/linalg/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/linalg.h. +# The "linalg_" names should be hidden from the user and not documented. +# +# See linalg_det as an example. + +# "_ex" stands for experimental +- func: linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info) + python_module: linalg + structured_delegate: linalg_cholesky_ex.L + +- func: linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_cholesky_ex_out + +- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor + python_module: linalg + +- func: linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor + python_module: linalg + variants: function + structured_delegate: linalg_cross.out + dispatch: + ZeroTensor: linalg_cross_zerotensor + +- func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + structured: True + dispatch: + CPU, CUDA, MPS: linalg_cross_out + +# linalg.lu_factor +- func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) + python_module: linalg + structured_delegate: linalg_lu_factor_ex.out + variants: function + +- func: linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) 
info) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_factor_ex_out + +# linalg.lu +- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + python_module: linalg + structured_delegate: linalg_lu.out + variants: function + +- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_out + +# linalg.lu_solve +- func: linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor + python_module: linalg + structured_delegate: linalg_lu_solve.out + variants: function + +- func: linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_solve_out + +# linalg.det +- func: _linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots) + structured_delegate: _linalg_det.result + +- func: _linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) + structured: True + dispatch: + CPU, CUDA: _linalg_det_out + +- func: linalg_det(Tensor A) -> Tensor + python_module: linalg + variants: function + +- func: linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +# torch.det, alias for torch.linalg.det +- func: det(Tensor self) -> Tensor + variants: function, method + +- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info) + structured_delegate: linalg_ldl_factor_ex.out + python_module: linalg + variants: function + +- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_factor_ex_out + +- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor + structured_delegate: linalg_ldl_solve.out + python_module: linalg + variants: function + +- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_solve_out + +- func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) + python_module: linalg + variants: function + dispatch: + CompositeExplicitAutograd: linalg_lstsq + tags: dynamic_output_shape + +- func: linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) 
singular_values) + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_lstsq_out + tags: dynamic_output_shape + +# torch.linalg.matmul, alias for torch.matmul +- func: linalg_matmul(Tensor self, Tensor other) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_vecdot(Tensor x, Tensor y, *, int dim=-1) -> Tensor + python_module: linalg + variants: function + +- func: linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_matrix_exp(Tensor self) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_matrix_exp + autogen: linalg_matrix_exp.out + +- func: _linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots) + structured_delegate: _linalg_slogdet.sign + +- func: _linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) + structured: True + dispatch: + CPU, CUDA: _linalg_slogdet_out + +- func: linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet) + python_module: linalg + +- func: linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet) + python_module: linalg + +- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) + variants: function, method + +- func: slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet) + variants: function + +- func: logdet(Tensor self) -> Tensor + variants: function, method + +- func: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors) + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_eig + +- func: linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + python_module: linalg + dispatch: + CPU, CUDA: linalg_eig_out + +- func: linalg_eigvals(Tensor self) -> Tensor + python_module: linalg + +- func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +# This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and +# `linalg.eigvalsh` as composite functions that call this one +- func: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors) + structured_delegate: _linalg_eigh.eigenvalues + +- func: _linalg_eigh.eigenvalues(Tensor A, str UPLO="L", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + structured: True + dispatch: + CPU, CUDA: _linalg_eigh_out + +- func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) + python_module: linalg + +- func: linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) + python_module: linalg + +- func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor + python_module: linalg + +- func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: linalg + +- func: linalg_householder_product(Tensor input, Tensor tau) -> Tensor + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_householder_product + +- func: linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + dispatch: + CPU, CUDA: linalg_householder_product_out + +- func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) + python_module: linalg + structured_delegate: linalg_inv_ex.inverse + +- func: linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_inv_ex_out + MPS: linalg_inv_ex_out_mps + +- func: linalg_inv(Tensor A) -> Tensor + python_module: linalg + +- func: linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: inverse(Tensor self) -> Tensor + variants: function, method + +- func: inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: inner(Tensor self, Tensor other) -> Tensor + variants: function, method + +- func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: outer(Tensor self, Tensor vec2) -> Tensor + variants: function, method + +- func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +# torch.ger, alias for torch.outer +- func: ger(Tensor self, Tensor vec2) -> Tensor + variants: function, method + +- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + structured_delegate: linalg_vector_norm.out + +- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_vector_norm_out + MPS: linalg_vector_norm_out_mps + +- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + +- func: linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_matrix_norm.str_ord(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + +- func: linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ python_module: linalg + +# This function is exposes the `compute_uv` flag, which is then used to implement `linalg.svd` and +# `linalg.svdvals` as composite functions that call this one +- func: _linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh) + variants: function + structured_delegate: _linalg_svd.U + +- func: _linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) + structured: True + dispatch: + CPU, CUDA: _linalg_svd_out + +- func: linalg_svd(Tensor A, bool full_matrices=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh) + python_module: linalg + variants: function + +- func: linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) + python_module: linalg + variants: function + +- func: linalg_svdvals(Tensor A, *, str? driver=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_cond.p_str(Tensor self, str p) -> Tensor + python_module: linalg + variants: function + +- func: linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + dispatch: + # calls svd, which calls mH() (view op) + # also calls narrow() + CompositeExplicitAutogradNonFunctional: linalg_pinv + +- func: linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + CompositeExplicitAutograd: linalg_pinv_out + +- func: linalg_pinv.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_pinv(Tensor self, float rcond, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_pinv.rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: linalg + variants: function + +- func: _linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info) + structured_delegate: _linalg_solve_ex.result + +- func: _linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) + structured: True + dispatch: + CPU, CUDA: _linalg_solve_ex_out + +- func: linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info) + python_module: linalg + +- func: linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info) + python_module: linalg + +- func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor + python_module: linalg + +- func: linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor + python_module: linalg + variants: function + +- func: linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) + python_module: linalg + variants: function + structured_delegate: linalg_qr.out + +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + python_module: linalg + structured: True + dispatch: + CPU, CUDA: linalg_qr_out + +- func: linalg_matrix_power(Tensor self, int n) -> Tensor + python_module: linalg + +- func: linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +- func: linalg_matrix_rank.atol_rtol_tensor(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + cpp_no_default_args: ['atol', 'rtol'] + python_module: linalg + variants: function + +- func: linalg_matrix_rank(Tensor self, float tol, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: linalg + variants: function + +- func: linalg_matrix_rank.tol_tensor(Tensor input, Tensor tol, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_multi_dot(Tensor[] tensors) -> Tensor + python_module: linalg + +- func: linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + +## Functions related to the `torch.nested` namespace +# Note [nested namespace binding] +# Functions in the nested python module should have their names start with +# "nested_" underscore and be bound to the desired Python name in +# torch/nested/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/nested.h. +# The "nested_" names should be hidden from the user and not documented. + +- func: nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor + python_module: nested + variants: function + +## Functions that are only for testing +# It is undocumented and should not be used outside of tests. +- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor + +# Note: this function is only for testing. +- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_intlist + autogen: _test_optional_intlist.out + +# Note: this function is only for testing. +- func: _test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_intlist + autogen: _test_optional_filled_intlist.out + +# Note: this function is only for testing. +- func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor + python_module: nn + dispatch: + CPU: _test_optional_floatlist + autogen: _test_optional_floatlist.out + +# Note: this function is only for testing. +- func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor + cpp_no_default_args: ['a', 'b'] + python_module: nn + +# Note: this function is only for testing. +- func: _test_warn_in_autograd(Tensor self) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: _test_warn_in_autograd + autogen: _test_warn_in_autograd.out + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor + dispatch: + # the NestedTensor keys are necessary because NestedTensor has been removed + # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys] + CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage + autogen: _test_autograd_multiple_dispatch.fullcoverage_out + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor + dispatch: + CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly + +# Note: this function is only for testing. 
+- func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a) + dispatch: + CompositeExplicitAutograd: _test_autograd_multiple_dispatch_view + +# Note: this function is only for testing. +- func: _test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _test_autograd_multiple_dispatch_view_copy + tags: view_copy + autogen: _test_autograd_multiple_dispatch_view_copy.out + +- func: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor + variants: function + dispatch: + CPU, CUDA: segment_reduce_kernel + autogen: segment_reduce.out + +- func: _segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None) -> Tensor + variants: function + dispatch: + CPU, CUDA: _segment_reduce_backward_kernel + autogen: _segment_reduce_backward.out + +- func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor + python_module: nn + variants: function + +- func: flatten_dense_tensors(Tensor[] tensors) -> Tensor + variants: function + python_module: nn + +- func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] + variants: function + python_module: nn + +- func: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _nested_tensor_from_tensor_list + autogen: _nested_tensor_from_tensor_list.out + +- func: _fw_primal_copy(Tensor self, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _fw_primal_copy + tags: view_copy + autogen: _fw_primal_copy.out + +- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _make_dual_copy + tags: view_copy + autogen: _make_dual_copy.out + +- func: view_as_real_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_as_real_copy + tags: view_copy + autogen: view_as_real_copy.out + +- func: view_as_complex_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_as_complex_copy + tags: view_copy + autogen: view_as_complex_copy.out + +- func: _conj_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _conj_copy + tags: view_copy + autogen: _conj_copy.out + +- func: _neg_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _neg_view_copy + tags: view_copy + autogen: _neg_view_copy.out + +- func: as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? 
storage_offset=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: as_strided_copy_symint + tags: view_copy + autogen: as_strided_copy.out + +- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _sparse_broadcast_to_copy + tags: view_copy + autogen: _sparse_broadcast_to_copy.out + +- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: diagonal_copy + tags: view_copy + autogen: diagonal_copy.out + +- func: expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: expand_copy_symint + tags: view_copy + autogen: expand_copy.out + +- func: permute_copy(Tensor self, int[] dims) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: permute_copy + tags: view_copy + autogen: permute_copy.out + +- func: _reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint + tags: view_copy + autogen: _reshape_alias_copy.out + +- func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: select_copy_symint + SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr + tags: view_copy + autogen: select_copy.int_out + +- func: detach_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: detach_copy + tags: view_copy + autogen: detach_copy.out + +- func: slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: slice_copy_Tensor_symint + tags: view_copy + autogen: slice_copy.Tensor_out + +- func: split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: split_copy_Tensor_symint + tags: view_copy + +- func: split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: split_with_sizes_copy_symint + tags: view_copy + +- func: squeeze_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy + tags: view_copy + autogen: squeeze_copy.out + +- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy_dim + tags: view_copy + autogen: squeeze_copy.dim_out + +- func: squeeze_copy.dims(Tensor self, int[] dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: squeeze_copy_dims + tags: view_copy + autogen: squeeze_copy.dims_out + +- func: t_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: t_copy + tags: view_copy + autogen: t_copy.out + +- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: transpose_copy_int + tags: view_copy + autogen: transpose_copy.int_out + +- func: unsqueeze_copy(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unsqueeze_copy + tags: view_copy + autogen: unsqueeze_copy.out + +- func: _indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _indices_copy + tags: view_copy + autogen: _indices_copy.out + +- func: _values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: _values_copy + tags: view_copy + autogen: _values_copy.out + +- func: indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: indices_copy + tags: view_copy + autogen: indices_copy.out + +- func: values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: values_copy + tags: view_copy + autogen: values_copy.out + +- func: crow_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: crow_indices_copy + tags: view_copy + autogen: crow_indices_copy.out + +- func: col_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: col_indices_copy + tags: view_copy + autogen: col_indices_copy.out + +- func: ccol_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: ccol_indices_copy + tags: view_copy + autogen: ccol_indices_copy.out + +- func: row_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: row_indices_copy + tags: view_copy + autogen: row_indices_copy.out + +- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unbind_copy_int + tags: view_copy + +- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + 
dispatch: + CompositeExplicitAutograd: unbind_copy_int_out + +- func: split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor_out + + +- func: split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy_out + +- func: view_copy(Tensor self, SymInt[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_copy_symint + tags: view_copy + autogen: view_copy.out + +- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: view_copy_dtype + tags: view_copy + autogen: view_copy.dtype_out + +- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: unfold_copy + tags: view_copy + autogen: unfold_copy.out + +- func: alias_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutogradNonFunctional: alias_copy + tags: view_copy + autogen: alias_copy.out + +- func: to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor + variants: method + dispatch: + NestedTensorCPU: NestedTensor_to_padded_tensor_generic + NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda + autogen: to_padded_tensor.out + +- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor + dispatch: + NestedTensorCPU: NestedTensor_softmax_dropout + NestedTensorCUDA: NestedTensor_softmax_dropout_cuda + tags: nondeterministic_seeded + +# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. +- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward + autogen: _transformer_encoder_layer_fwd.out + +- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor) + variants: function + dispatch: + CPU, NestedTensorCPU: native_multi_head_attention_cpu + CUDA, NestedTensorCUDA: native_multi_head_attention_cuda + autogen: _native_multi_head_attention.out + +- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor + python_module: nn + variants: function + autogen: scaled_dot_product_attention.out + tags: nondeterministic_seeded + +# TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN +- func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? 
attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor) + python_module: nn + variants: function + autogen: _scaled_dot_product_attention.out + tags: nondeterministic_seeded + +# This aten function is kept so that we can test the choice function from Python +- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int + dispatch: + Meta: _fused_sdp_choice_meta + CPU, NestedTensorCPU: _fused_sdp_choice_cpp + CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor) + variants: function + tags: nondeterministic_seeded + +- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + dispatch: + CUDA: _scaled_dot_product_flash_attention_cuda + NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value) + device_check: NoCheck + variants: function + dispatch: + CUDA: _scaled_dot_product_flash_attention_backward_cuda + +- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp) + dispatch: + CUDA: _scaled_dot_product_efficient_attention_cuda + NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda + +- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False, *, float? scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _scaled_dot_product_efficient_attention_backward_cuda + +# THIS FUNCTION iS DEPRECATED AND SHOULD BE REMOVED +- func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool + dispatch: + CUDA: _chunk_grad_outputs_efficient_attention + +- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + variants: function + dispatch: + CUDA: _flash_attention_forward + tags: nondeterministic_seeded + +- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? 
scale=None) -> (Tensor, Tensor, Tensor) + device_check: NoCheck + variants: function + dispatch: + CUDA: _flash_attention_backward + +# Returns ouput, logsumexp if compute_logsumexp +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp) + variants: function + dispatch: + CUDA: _efficient_attention_forward + tags: nondeterministic_seeded + +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor rng_seed, Tensor rng_offset, int custom_mask_type, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor) + device_check: NoCheck + variants: function + dispatch: + CUDA: _efficient_attention_backward + +- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor + variants: function + dispatch: + CUDA: triton_scaled_dot_attention + autogen: _triton_scaled_dot_attention.out + tags: nondeterministic_seeded + +- func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor + variants: function + dispatch: + CUDA: triton_multi_head_attention + autogen: _triton_multi_head_attention.out + +- func: special_airy_ai(Tensor x) -> Tensor + python_module: special + structured_delegate: special_airy_ai.out + variants: function + tags: pointwise + +- func: special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_airy_ai_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_j0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_j0.out + variants: function + tags: pointwise + +- func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_j0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_j1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_j1.out + variants: function + tags: pointwise + +- func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_j1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_y0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_y0.out + variants: function + tags: pointwise + +- func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_bessel_y0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_bessel_y1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_bessel_y1.out + variants: function + tags: pointwise + +- func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: special_bessel_y1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_t.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_t_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_u.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_u_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_v.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_v_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_chebyshev_polynomial_w.out + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_chebyshev_polynomial_w_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_hermite_polynomial_h.out + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_hermite_polynomial_h_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_h_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_hermite_polynomial_he.out + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_hermite_polynomial_he_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_hermite_polynomial_he_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_laguerre_polynomial_l.out + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_laguerre_polynomial_l_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_laguerre_polynomial_l_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_legendre_polynomial_p.out + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck + dispatch: + CPU, CUDA: special_legendre_polynomial_p_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_legendre_polynomial_p_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_modified_bessel_i0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_i0.out + variants: function + tags: pointwise + +- func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_i0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_i1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_i1.out + variants: function + tags: pointwise + +- func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_i1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_k0(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_k0.out + variants: function + tags: pointwise + +- func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_k0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_modified_bessel_k1(Tensor self) -> Tensor + python_module: special + structured_delegate: special_modified_bessel_k1.out + variants: function + tags: pointwise + +- func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_modified_bessel_k1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k0(Tensor x) -> Tensor + python_module: special + structured_delegate: special_scaled_modified_bessel_k0.out + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: special_scaled_modified_bessel_k0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k1(Tensor x) -> Tensor + python_module: special + structured_delegate: special_scaled_modified_bessel_k1.out + variants: function + tags: pointwise + +- func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: special_scaled_modified_bessel_k1_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_t.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_t_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_u.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_u_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_v.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_v_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + structured_delegate: special_shifted_chebyshev_polynomial_w.out + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + dispatch: + CPU, CUDA: special_shifted_chebyshev_polynomial_w_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out + device_check: NoCheck + python_module: special + variants: function + tags: pointwise + +- func: special_spherical_bessel_j0(Tensor x) -> Tensor + python_module: special + structured_delegate: special_spherical_bessel_j0.out + variants: function + tags: pointwise + +- func: special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: special_spherical_bessel_j0_out + python_module: special + structured_inherits: TensorIteratorBase + structured: True + variants: function + tags: pointwise + +# Aux function used in the test TestPythonDispatch.test_kwarg_only_and_positional_default +# within test/test_python_dispatch.py +- func: _foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor + dispatch: + CPU: foobar + autogen: _foobar.out + +# Fused Optimizer CUDA kernels. +- func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CUDA: _fused_adam_kernel_cuda_ + autogen: _fused_adam, _fused_adam.out + +- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CUDA: _fused_adamw_kernel_cuda_ + autogen: _fused_adamw, _fused_adamw.out + +# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. +- func: _propagate_xla_data(Tensor input, Tensor output) -> () + variants: function diff --git a/torchgen/packaged/ATen/native/tags.yaml b/torchgen/packaged/ATen/native/tags.yaml new file mode 100644 index 00000000000..9f7fb7fec59 --- /dev/null +++ b/torchgen/packaged/ATen/native/tags.yaml @@ -0,0 +1,50 @@ +# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml` + +- tag: inplace_view + desc: | + This tag indicates if an operator *only* modifies the tensor metadata +- tag: view_copy + desc: | + This tag indicates operators that are *_copy* variants + of view/aliasing operators. If an operator has a view_copy tag, + then it should have the name {op}_copy, where {op} is a view operator. +- tag: dynamic_output_shape + desc: | + This tag indicates if an operator's output's shape depends on input Tensor + data. +- tag: data_dependent_output + desc: | + Operator has a non-Tensor output whose value is dependent on the data + of Tensor inputs. Among other things, this implies that this operator + cannot be run with meta tensor (since data is not available), nor + can it be symbolically traced. +- tag: generated + desc: | + This tag indicates that the operator doesn't have an explicit entry in + native_functions.yaml, and instead was generated automatically by the codegen. +- tag: nondeterministic_seeded + desc: | + This tag indicates if an operator is nondeterministically seeded + (i.e., is random) such that the operator intentionally produces + different results when run twice on the same inputs, but this randomness + is controlled by a Generator which, if reseeded would give you the + same result. +- tag: nondeterministic_bitwise + desc: | + This tag indicates if an operator doesn't guarantee bitwise equivalence + across different runs of an operator with identical inputs. 
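As an illustration of the `nondeterministic_seeded` tag described above: the randomness of a tagged operator is driven entirely by a Generator, so reseeding the same Generator reproduces the result. A minimal sketch, assuming a recent PyTorch build where `torch.rand` carries this tag:

    import torch

    g = torch.Generator().manual_seed(42)
    a = torch.rand(3, generator=g)   # random, but controlled entirely by g

    g.manual_seed(42)                # reseed the same Generator
    b = torch.rand(3, generator=g)

    assert torch.equal(a, b)         # reseeding reproduces the result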
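The `view_copy` tag above pairs with the `{op}_copy` entries registered earlier in this file (`transpose_copy`, `permute_copy`, `slice_copy`, and so on): each returns a freshly materialized tensor rather than an alias of the input. A small sketch of the difference, assuming `torch.transpose_copy` is exposed in the build:

    import torch

    x = torch.arange(6.0).reshape(2, 3)
    v = torch.transpose(x, 0, 1)        # view: aliases x's storage
    c = torch.transpose_copy(x, 0, 1)   # *_copy variant: owns its own storage

    x[0, 0] = 100.0
    print(v[0, 0].item())  # 100.0 -- the view observes the write
    print(c[0, 0].item())  # 0.0   -- the copy does not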
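The `scaled_dot_product_attention` entry registered above is exposed as `torch.nn.functional.scaled_dot_product_attention`, with `_scaled_dot_product_flash_attention`, `_scaled_dot_product_efficient_attention`, and `_scaled_dot_product_attention_math` as the backend kernels it can dispatch to. A hedged usage sketch; the shapes and the causal setting are illustrative assumptions, not part of the schema:

    import torch
    import torch.nn.functional as F

    # (batch, heads, seq_len, head_dim)
    q = torch.randn(2, 8, 16, 64)
    k = torch.randn(2, 8, 16, 64)
    v = torch.randn(2, 8, 16, 64)

    # computes softmax(q @ k.transpose(-2, -1) / sqrt(head_dim)) @ v,
    # optionally with a causal mask, picking a fused kernel when available
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    print(out.shape)  # torch.Size([2, 8, 16, 64])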
+- tag: core
+  desc: |
+    Core aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
+    functionalization pass. Core aten ops are fully functional and adhere to single static
+    assignment (SSA): this implies there will be no `inplace` or `_out` variants in this opset.
+    This opset is designed to serve as the functional IR to interface with compiler backends.
+    In contrast to primTorch, core aten opset doesn't decompose ops into explicit
+    type promotion and broadcasting ops.
+    Core aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
+    and thus can be used as an opset for export purpose.
+- tag: pointwise
+  desc: |
+    Pointwise operators are operators where each element of the output is computed only by accessing
+    the corresponding element of all the broadcasted inputs. The output shape will be the broadcasted
+    shape of the inputs.
diff --git a/torchgen/packaged/ATen/templates/ATenOpList.cpp b/torchgen/packaged/ATen/templates/ATenOpList.cpp
new file mode 100644
index 00000000000..5de3424857e
--- /dev/null
+++ b/torchgen/packaged/ATen/templates/ATenOpList.cpp
@@ -0,0 +1,36 @@
+#include <ATen/core/ATenOpList.h>
+
+#include <cstring>
+#include <string>
+#include <utility>
+#include <unordered_set>
+#include <ATen/core/operator_name.h>
+
+// ${generated_comment}
+
+namespace at {
+
+namespace {
+struct OpNameEquals final {
+  bool operator()(const std::pair<const char*, const char*>& lhs, const std::pair<const char*, const char*>& rhs) const {
+      return 0 == strcmp(lhs.first, rhs.first) && 0 == strcmp(lhs.second, rhs.second);
+  }
+};
+
+struct OpNameHash final {
+  size_t operator()(const std::pair<const char*, const char*>& p) const {
+      // use std::hash<std::string> because std::hash<const char*> would hash pointers and not pointed-to strings
+      return std::hash<std::string>()(p.first) ^ (~ std::hash<std::string>()(p.second));
+  }
+};
+}
+
+bool is_custom_op(const c10::OperatorName& opName) {
+  static std::unordered_set<std::pair<const char*, const char*>, OpNameHash, OpNameEquals> ops {
+    ${aten_ops}
+    {"", ""}
+  };
+  return ops.count(std::make_pair(
+             opName.name.c_str(), opName.overload_name.c_str())) == 0;
+}
+}
diff --git a/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp b/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp
new file mode 100644
index 00000000000..47097d7aa43
--- /dev/null
+++ b/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp
@@ -0,0 +1,73 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+// ${generated_comment}
+
+#include <ATen/Tensor.h>
+#include <ATen/native/Resize.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Operators.h>
+#else
+#include <ATen/ops/clone.h>
+$ops_headers
+#endif
+
+namespace at {
+namespace native {
+
+// This file contains a number of kernels for aten functions that are fully code-generated.
+// TODO: rename this file to something more generic.
+
+namespace {
+at::Tensor clone_arg(const at::Tensor& t) {
+    return t.clone();
+}
+
+std::vector<at::Tensor> clone_arg(const at::TensorList& t_list) {
+    std::vector<at::Tensor> out(t_list.size());
+    for (const auto& i : c10::irange(t_list.size())) {
+        out[i] = t_list[i].clone();
+    }
+    return out;
+}
+
+// duped with gen_resize_out_helper from structured kernels
+void copy_arg(const at::Tensor& dst, const at::Tensor& src) {
+    TORCH_CHECK(src.dtype() == dst.dtype(),
+        "Expected out tensor to have dtype ", src.dtype(), ", but got ", dst.dtype(), " instead");
+    TORCH_CHECK(src.device() == dst.device(),
+        "Expected out tensor to have device ", src.device(), ", but got ", dst.device(), " instead");
+    dst.copy_(src);
+}
+
+void copy_arg(const at::TensorList& dst, const at::TensorList& src) {
+    TORCH_INTERNAL_ASSERT(dst.size() == src.size());
+    for (const auto& i : c10::irange(dst.size())) {
+        copy_arg(dst[i], src[i]);
+    }
+}
+
+// TODO: this doesn't handle restriding empty tensors correctly; see
+// gen_resize_out_helper for the correct algorithm
+
+void resize_out_helper(const at::Tensor& dst, const at::Tensor& src) {
+    at::native::resize_output(dst, src.sizes());
+}
+
+void resize_out_helper(const at::TensorList& dst, const at::TensorList& src) {
+    TORCH_INTERNAL_ASSERT(dst.size() == src.size());
+    for (const auto& i : c10::irange(dst.size())) {
+        at::native::resize_output(dst[i], src[i].sizes());
+    }
+}
+}
+
+
+${CompositeViewCopyKernel_Definitions}
+
+${GeneratedCompositeFunctional_Definitions}
+
+${GeneratedCompositeOut_Definitions}
+
+} // namespace native
+} // namespace at
diff --git a/torchgen/packaged/ATen/templates/DispatchKeyFunction.h b/torchgen/packaged/ATen/templates/DispatchKeyFunction.h
new file mode 100644
index 00000000000..c92d5eb3898
--- /dev/null
+++ b/torchgen/packaged/ATen/templates/DispatchKeyFunction.h
@@ -0,0 +1,23 @@
+#pragma once
+// ${generated_comment}
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <ATen/core/Reduction.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace ${dispatch_namespace} {
+
+${dispatch_namespaced_declarations}
+
+} // namespace ${dispatch_namespace}
+} // namespace at
diff --git a/torchgen/packaged/ATen/templates/DispatchKeyFunctions.h b/torchgen/packaged/ATen/templates/DispatchKeyFunctions.h
new file mode 100644
index 00000000000..ffae7131913
--- /dev/null
+++ b/torchgen/packaged/ATen/templates/DispatchKeyFunctions.h
@@ -0,0 +1,29 @@
+#include <ATen/core/TensorBody.h>
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods +// all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all +// directly inlined into TensorBody.h. +// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API, +// which include functions that have defaultable optional arguments. +// That requires knowing the full Tensor class definition. +// +// We break the cycle by doing the following: +// - Split out CPUFunction.h into two files: CPUFunctions.h and CPUFunctions_inl.h +// - CPUFunction.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl., +// - CPUFunctions_inl.h includes everything else +// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class, +// and then it includes CPUFunctions_inl.h. +// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. +// - This also means that static dispatch build, CPUFunctions.h only needs to +// #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. +${inline_headers} diff --git a/torchgen/packaged/ATen/templates/DispatchKeyFunctions_inl.h b/torchgen/packaged/ATen/templates/DispatchKeyFunctions_inl.h new file mode 100644 index 00000000000..fbb71c2cb12 --- /dev/null +++ b/torchgen/packaged/ATen/templates/DispatchKeyFunctions_inl.h @@ -0,0 +1,22 @@ +#pragma once +// ${generated_comment} + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + . \ + See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. +#endif + +${DispatchKeyFunctions_inl_includes} + + +${dispatch_namespaced_declarations} diff --git a/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.cpp b/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.cpp new file mode 100644 index 00000000000..7647f459a74 --- /dev/null +++ b/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.cpp @@ -0,0 +1,13 @@ +// ${generated_comment} +${includes} +${native_functions_include} + +namespace { +${helper_fns} +} // namespace + +${namespace_prologue} + +${native_function_definitions} + +${namespace_epilogue} diff --git a/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.h b/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.h new file mode 100644 index 00000000000..b45a17b5922 --- /dev/null +++ b/torchgen/packaged/ATen/templates/DispatchKeyNativeFunctions.h @@ -0,0 +1,19 @@ +#pragma once + +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. 
+// clang-format off + +// ${generated_comment} + +#include + +${namespace_prologue} + +struct ${class_name} { + +${dispatch_declarations} + +}; +${namespace_epilogue} diff --git a/torchgen/packaged/ATen/templates/Function.h b/torchgen/packaged/ATen/templates/Function.h new file mode 100644 index 00000000000..5bbd742aae0 --- /dev/null +++ b/torchgen/packaged/ATen/templates/Function.h @@ -0,0 +1,26 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +${static_dispatch_ops_headers} + +${operator_includes} + +namespace at { + +${function_definitions} + +} diff --git a/torchgen/packaged/ATen/templates/FunctionalInverses.h b/torchgen/packaged/ATen/templates/FunctionalInverses.h new file mode 100644 index 00000000000..eea76eeecb1 --- /dev/null +++ b/torchgen/packaged/ATen/templates/FunctionalInverses.h @@ -0,0 +1,16 @@ +#pragma once + +// ${generated_comment} + +#include + +namespace at { +namespace functionalization { + +struct FunctionalInverses { + +${view_inverse_declarations} + +}; +} +} diff --git a/torchgen/packaged/ATen/templates/Functions.cpp b/torchgen/packaged/ATen/templates/Functions.cpp new file mode 100644 index 00000000000..3b7374ef1f0 --- /dev/null +++ b/torchgen/packaged/ATen/templates/Functions.cpp @@ -0,0 +1,101 @@ +#include + +#include +#include + +namespace at { + +Tensor TensorMaker::make_tensor() { + AutoDispatchBelowADInplaceOrView guard{}; // TODO: Remove. + tracer::impl::NoTracerDispatchMode tracer_guard{}; + + check_size_nonnegative(sizes_); + + TORCH_CHECK_VALUE( + !deleter_ || !ctx_, + "The deleter and context arguments are mutually exclusive."); + + if (device_ == nullopt) { + device_ = globalContext().getDeviceFromPtr(data_, opts_.device().type()); + } + + if (opts_.device().has_index()) { + // clang-format off + TORCH_CHECK_VALUE( + opts_.device() == *device_, + "Specified device ", opts_.device(), " does not match device of data ", *device_); + // clang-format on + } + + std::size_t size_bytes = computeStorageSize(); + + DataPtr data_ptr{}; + if (deleter_) { + data_ptr = makeDataPtrFromDeleter(); + } else { + data_ptr = makeDataPtrFromContext(); + } + + Storage storage{Storage::use_byte_size_t{}, size_bytes, std::move(data_ptr)}; + + Tensor tensor = detail::make_tensor( + std::move(storage), opts_.computeDispatchKey(), opts_.dtype()); + + TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl(); + if (strides_) { + tensor_impl->set_sizes_and_strides(sizes_, *strides_); + } else { + tensor_impl->set_sizes_contiguous(sizes_); + } + if (storage_offset_) { + tensor_impl->set_storage_offset(*storage_offset_); + } + + return tensor; + } + + std::size_t TensorMaker::computeStorageSize() const noexcept { + std::size_t itemsize = opts_.dtype().itemsize(); + + if (strides_) { + auto storage_size = detail::computeStorageNbytes(sizes_, *strides_, itemsize); + if (storage_offset_) { + storage_size += storage_offset_.value(); + } + return storage_size; + } + + std::size_t size = 1; + for (std::int64_t s : sizes_) { + size *= static_cast(s); + } + auto storage_size = size * itemsize; + if (storage_offset_) { + storage_size += storage_offset_.value(); + } + return storage_size; + } + + inline DataPtr TensorMaker::makeDataPtrFromDeleter() const { + return InefficientStdFunctionContext::makeDataPtr(data_, deleter_, *device_); + } + + inline DataPtr TensorMaker::makeDataPtrFromContext() noexcept { + return DataPtr{data_, ctx_.release(), ctx_.get_deleter(), *device_}; + 
} + + IntArrayRef TensorMaker::makeTempSizes() const noexcept { + static std::int64_t zeros[5] = {0, 0, 0, 0, 0}; + if (opts_.has_memory_format()) { + MemoryFormat format = *opts_.memory_format_opt(); + if (format == MemoryFormat::ChannelsLast) { + return IntArrayRef(zeros, 4); + } + if (format == MemoryFormat::ChannelsLast3d) { + return IntArrayRef(zeros, 5); + } + } + return IntArrayRef(zeros, 1); + } + +} // namespace at diff --git a/torchgen/packaged/ATen/templates/Functions.h b/torchgen/packaged/ATen/templates/Functions.h new file mode 100644 index 00000000000..fb531363f53 --- /dev/null +++ b/torchgen/packaged/ATen/templates/Functions.h @@ -0,0 +1,143 @@ +#pragma once + +// ${generated_comment} + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from and \ + see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. +#endif + +// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS] +// +// In ATen, certain generated headers files include the definitions of +// every single operator in PyTorch. Unfortunately this means every +// time an operator signature is updated or changed in +// native_functions.yaml, you (and every other PyTorch developer) need +// to recompile every source file that includes any of these headers. +// +// To break up these header dependencies, and improve incremental +// build times for all PyTorch developers. These headers are split +// into per-operator headers in the `ATen/ops` folder. This limits +// incremental builds to only changes to methods of `Tensor`, or files +// that use the specific operator being changed. With `at::sum` as an +// example, you should include +// +// // instead of ATen/Functions.h +// // instead of ATen/NativeFunctions.h +// // instead of ATen/Operators.h +// // instead of ATen/CPUFunctions.h +// +// However, even if you're careful to use this in your own code. +// `Functions.h` might be included indirectly through another header +// without you realising. To avoid this, you can add +// +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// +// to the top of your source file. This way any time the non-specific +// headers are included, the compiler will error out. +// +// Also, be aware that `ops` are not available in all build +// configurations (namely fb-internal) so you must guard these +// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g. 
+// +// #ifndef AT_PER_OPERATOR_HEADERS +// #include +// #else +// #include +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +${Functions_includes} + +namespace at { + +${Functions_declarations} + +// Special C++ only overloads for std()-like functions (See gh-40287) +// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef +// So, for example std(0) would select the std(unbiased=False) overload +TORCH_API inline Tensor var(const Tensor& self, int dim) { + return at::var(self, IntArrayRef{dim}); +} +TORCH_API inline std::tuple var_mean(const Tensor& self, int dim) { + return at::var_mean(self, IntArrayRef{dim}); +} +TORCH_API inline Tensor std(const Tensor& self, int dim) { + return at::std(self, IntArrayRef{dim}); +} +TORCH_API inline std::tuple std_mean(const Tensor& self, int dim) { + return at::std_mean(self, IntArrayRef{dim}); +} + +inline int64_t numel(const Tensor& tensor) { + return tensor.numel(); +} + +inline int64_t size(const Tensor& tensor, int64_t dim) { + return tensor.size(dim); +} + +inline int64_t stride(const Tensor& tensor, int64_t dim) { + return tensor.stride(dim); +} + +inline bool is_complex(const Tensor& tensor) { + return tensor.is_complex(); +} + +inline bool is_floating_point(const Tensor& tensor) { + return tensor.is_floating_point(); +} + +inline bool is_signed(const Tensor& tensor) { + return tensor.is_signed(); +} + +inline bool is_inference(const Tensor& tensor) { + return tensor.is_inference(); +} + +inline bool _is_zerotensor(const Tensor& tensor) { + return tensor._is_zerotensor(); +} + +inline bool is_conj(const Tensor& tensor) { + return tensor.is_conj(); +} + +inline Tensor conj(const Tensor& tensor) { + return tensor.conj(); +} + +inline bool is_neg(const Tensor& tensor) { + return tensor.is_neg(); +} + +} diff --git a/torchgen/packaged/ATen/templates/LazyIr.h b/torchgen/packaged/ATen/templates/LazyIr.h new file mode 100644 index 00000000000..1ee90e66cc6 --- /dev/null +++ b/torchgen/packaged/ATen/templates/LazyIr.h @@ -0,0 +1,19 @@ +#pragma once + +// This file contains autogenerated LazyTensor IR nodes +${lazy_ir_sysinc} +${lazy_ir_inc} + +${namespace_prologue} +using at::operator<<; + +// kNullValue is used to contribute a static hash value any time +// a node has an Optional input that is nullopt. It is important +// to differentiate between HASH(nullopt, something) and HASH(something, nullopt), +// and using kNullValue in the hash function in the order of arguments +// serves this purpose. 
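As a standalone illustration of that ordering point (this is not the torch::lazy hashing code; the combiner, the sentinel, and hash_operands below are invented for the sketch):

    #include <cstddef>
    #include <functional>
    #include <optional>

    // Toy order-sensitive combiner standing in for the real lazy-IR hashing.
    std::size_t combine(std::size_t seed, std::size_t v) {
      return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
    }

    // A fixed sentinel plays the role of kNullValue: a missing operand still
    // contributes a value at its position, so hash(nullopt, x) and
    // hash(x, nullopt) combine different sequences and no longer collide the
    // way "skip the missing operand" would.
    std::size_t hash_operands(const std::optional<int>& a, const std::optional<int>& b) {
      constexpr std::size_t kNull = 0x517cc1b7;  // arbitrary sentinel
      auto h = [&](const std::optional<int>& v) {
        return v ? std::hash<int>{}(*v) : kNull;
      };
      return combine(combine(0, h(a)), h(b));
    }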
+static const torch::lazy::Value kNullValue = torch::lazy::Value(); + +${ir_declarations} + +${namespace_epilogue} diff --git a/torchgen/packaged/ATen/templates/LazyNonNativeIr.h b/torchgen/packaged/ATen/templates/LazyNonNativeIr.h new file mode 100644 index 00000000000..18eaf6da52e --- /dev/null +++ b/torchgen/packaged/ATen/templates/LazyNonNativeIr.h @@ -0,0 +1,11 @@ +#pragma once + +${lazy_non_native_ir_inc} + +// This file contains autogenerated LazyTensor Non Native IR nodes + +${namespace_prologue} + +${non_native_ir_nodes} + +${namespace_epilogue} diff --git a/torchgen/packaged/ATen/templates/MethodOperators.h b/torchgen/packaged/ATen/templates/MethodOperators.h new file mode 100644 index 00000000000..0e192cd05ef --- /dev/null +++ b/torchgen/packaged/ATen/templates/MethodOperators.h @@ -0,0 +1,24 @@ +#pragma once + +// ${generated_comment} + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +${MethodOperators_includes} + +namespace at { +namespace _ops { +${MethodOperators_declarations} +} // namespace _ops +} // namespace at diff --git a/torchgen/packaged/ATen/templates/NativeFunction.h b/torchgen/packaged/ATen/templates/NativeFunction.h new file mode 100644 index 00000000000..4f70db62a4c --- /dev/null +++ b/torchgen/packaged/ATen/templates/NativeFunction.h @@ -0,0 +1,17 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +${extra_includes} + +${native_function_declarations} diff --git a/torchgen/packaged/ATen/templates/NativeFunctions.h b/torchgen/packaged/ATen/templates/NativeFunctions.h new file mode 100644 index 00000000000..d6d7205b579 --- /dev/null +++ b/torchgen/packaged/ATen/templates/NativeFunctions.h @@ -0,0 +1,33 @@ +#pragma once + +// ${generated_comment} + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +${NativeFunctions_includes} + +${NativeFunctions_declarations} diff --git a/torchgen/packaged/ATen/templates/NativeMetaFunction.h b/torchgen/packaged/ATen/templates/NativeMetaFunction.h new file mode 100644 index 00000000000..d660becdd9e --- /dev/null +++ b/torchgen/packaged/ATen/templates/NativeMetaFunction.h @@ -0,0 +1,23 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +${meta_function_declarations} + +} // namespace native +} // namespace at diff --git a/torchgen/packaged/ATen/templates/NativeMetaFunctions.h b/torchgen/packaged/ATen/templates/NativeMetaFunctions.h new file mode 100644 index 00000000000..89989e2121c --- /dev/null +++ b/torchgen/packaged/ATen/templates/NativeMetaFunctions.h @@ -0,0 +1,19 @@ +#pragma once + +// ${generated_comment} + +#include +#include +#include +#include + +${NativeMetaFunctions_includes} + +namespace at { + +namespace meta { + +${NativeMetaFunctions_declarations} + +} // namespace meta +} // namespace at diff --git a/torchgen/packaged/ATen/templates/Operator.h b/torchgen/packaged/ATen/templates/Operator.h new file mode 100644 index 00000000000..8b3989b66de --- /dev/null +++ b/torchgen/packaged/ATen/templates/Operator.h @@ -0,0 +1,18 @@ +#pragma once + +// ${generated_comment} + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + +${declarations} + +}} // namespace at::_ops diff --git a/torchgen/packaged/ATen/templates/Operators.cpp b/torchgen/packaged/ATen/templates/Operators.cpp new file mode 100644 index 00000000000..082bb67c3e2 --- /dev/null +++ b/torchgen/packaged/ATen/templates/Operators.cpp @@ -0,0 +1,19 @@ +#include +#include + +// ${generated_comment} +// NOTE See [Sharded File] comment in VariableType + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +${operator_headers} +#endif + +${static_dispatch_extra_headers} + +namespace at { namespace _ops { + +${definitions} + +}} // namespace at::_ops diff --git a/torchgen/packaged/ATen/templates/Operators.h b/torchgen/packaged/ATen/templates/Operators.h new file mode 100644 index 00000000000..e74b96ef3d5 --- /dev/null +++ b/torchgen/packaged/ATen/templates/Operators.h @@ -0,0 +1,74 @@ +#pragma once + +// ${generated_comment} + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +${Operators_includes} + +// Extension writers: do you write wrapper functions? Are you frustrated with +// resolving overloads of operators? Are you frustrated with dealing with +// pointer-to-methods and resolving overloads of pointer-to-methods?? Look no +// further, this is the utility for you. +// +// Given an operator schema: aten::op.overload(... +// +// Use ATEN_FN2(op, overload) to get a *function* version of the operator +// that is guaranteed to not be overloaded. This means that you can safely +// decltype(&ATEN_FN2(op, overload)) it. NB: the 2 means this macro takes 2 args. +// +// Given an operator schema without an overload name: aten::op(... +// +// Use ATEN_FN(op) to get an unambiguous *function* version of the operator. +// +// There is some interesting behavior for out= operations. +// ATEN_FN2(sin, out) gives a function that is *faithful* to the schema; +// that is, the order of arguments is exactly what it looks like in the schema. + +#define ATEN_FN2(op_name, overload) at::_ops::op_name##_##overload::call +#define ATEN_FN(op_name) at::_ops::op_name::call + +// Separately, ATEN_OP(op) and ATEN_OP2(op, overload) define a class containing compile-time +// metadata about a given aten operator. +// Notable data on the class includes: +// - ATEN_OP2(add, Tensor)::name // returns the string name: "add" +// - ATEN_OP2(add, Tensor)::overload_name // returns the string overload name: "Tensor" +// - ATEN_OP2(add, Tensor)::schema // returns the C++ schema type: at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &) +// - ATEN_OP2(add, Tensor)::schema_str // returns the string jit type: "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor" + +#define ATEN_OP2(op_name, overload) at::_ops::op_name##_##overload +#define ATEN_OP(op_name) at::_ops::op_name + +// WARNING: Please do not call any of the ops in the _ops namespace directly. +// Use the ATEN_FN macros. We do not guarantee stability of the naming +// scheme for the functions in at::_ops + +// See Note [The ATen Operators API] for details of the at::_ops namespace + +namespace at { +namespace _ops { +${Operators_declarations} +} // namespace _ops +} // namespace at diff --git a/torchgen/packaged/ATen/templates/RedispatchFunctions.cpp b/torchgen/packaged/ATen/templates/RedispatchFunctions.cpp new file mode 100644 index 00000000000..58102bd97fc --- /dev/null +++ b/torchgen/packaged/ATen/templates/RedispatchFunctions.cpp @@ -0,0 +1,15 @@ +// ${generated_comment} + +#include +#include + +#include +#include + +namespace at { + +namespace redispatch { + ${function_redispatch_definitions} +} // namespace redispatch + +} // namespace at diff --git a/torchgen/packaged/ATen/templates/RedispatchFunctions.h b/torchgen/packaged/ATen/templates/RedispatchFunctions.h new file mode 100644 index 00000000000..d89975a4a62 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RedispatchFunctions.h @@ -0,0 +1,32 @@ +#pragma once + +// ${generated_comment} + +#ifdef TORCH_ASSERT_ONLY_METHOD_OPERATORS +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. 
\ + Consider using the at::_ops::{name}::redispatch() interface by including \ + the specific operator from +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +namespace redispatch { + ${function_redispatch_definitions} +} // namespace redispatch + +} diff --git a/torchgen/packaged/ATen/templates/RegisterBackendSelect.cpp b/torchgen/packaged/ATen/templates/RegisterBackendSelect.cpp new file mode 100644 index 00000000000..6463701a493 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterBackendSelect.cpp @@ -0,0 +1,49 @@ +// We register ops with a higher priority dispatch key (BackendSelect) than the usual backend-specific keys (e.g. CPU) +// which makes calls to the factory functions dispatch to here. +// We then 'manually' compute a lower-priority to re-dispatch to (e.g. CPU) to get to the eventually correct backend. +// ${generated_comment} + +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include + +${ops_headers} +#endif + +namespace at { + +namespace { + +${backend_select_method_definitions} + +bool is_pinned(const Tensor& self, c10::optional device) { + // Only CPU tensors can be pinned + if (!self.is_cpu()) { + return false; + } + // TODO: fetch scalar type from Tensor? But it doesn't really matter... + DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::is_pinned::redispatch(_dk, self, device); +} + +at::Tensor _pin_memory(const Tensor& self, c10::optional device) { + TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); + DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); + return at::_ops::_pin_memory::redispatch(_dk, self, device); +} + +TORCH_LIBRARY_IMPL(aten, BackendSelect, m) { + ${backend_select_function_registrations}; + m.impl(TORCH_SELECTIVE_NAME("aten::is_pinned"), TORCH_FN(is_pinned)); + m.impl(TORCH_SELECTIVE_NAME("aten::_pin_memory"), TORCH_FN(_pin_memory)); +} + +} // namespace +} // at diff --git a/torchgen/packaged/ATen/templates/RegisterCodegenUnboxedKernels.cpp b/torchgen/packaged/ATen/templates/RegisterCodegenUnboxedKernels.cpp new file mode 100644 index 00000000000..279f987c66a --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterCodegenUnboxedKernels.cpp @@ -0,0 +1,41 @@ +#include +#include +#include + +#include + +// ${generated_comment} + +// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up +// incremental rebuilds. See the comment at the top of +// templates/VariableType.cpp for an analogous, in-depth discussion. +// +// Generated by tools/jit/gen_unboxing.py. This file registers all ATen ops into JIT op registry instead of c10 +// dispatcher. JIT op registry only takes boxed kernels, so we are calling unboxing functions in UnboxingFunctions.h +// to cast arguments into C++ types (instead of IValue) and delegate to unboxed kernels. 
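The generated entries route through the unboxing functions declared in UnboxingFunctions.h, but their overall shape is roughly the following hand-written sketch, using aten::add.Tensor purely as an example:

    // Boxed wrapper for add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor.
    // Arguments were pushed onto the JIT stack left-to-right, so they are popped
    // in reverse schema order, converted to C++ types, and the unboxed kernel's
    // result is pushed back as an IValue.
    [](torch::jit::Stack& stack) {
      at::Scalar alpha = torch::jit::pop(stack).toScalar();
      at::Tensor other = torch::jit::pop(stack).toTensor();
      at::Tensor self  = torch::jit::pop(stack).toTensor();
      torch::jit::push(stack, at::add(self, other, alpha));
    }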
+ +namespace torch { namespace jit { + +using autograd::Variable; +using autograd::variable_list; +using at::Scalar; +using at::ScalarType; +using at::Tensor; +using at::TensorOptions; +using at::DeviceGuard; + +using ::c10::fmap; +using ::c10::filter; + +namespace { + +RegisterOperators reg({ + + // Generated operators + ${unboxed_ops} +}); + +} // anon namespace + + +}} // namespace torch::jit diff --git a/torchgen/packaged/ATen/templates/RegisterDispatchDefinitions.ini b/torchgen/packaged/ATen/templates/RegisterDispatchDefinitions.ini new file mode 100644 index 00000000000..3bf7f9b1bb3 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterDispatchDefinitions.ini @@ -0,0 +1,24 @@ +${ns_prologue} + +// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid +// ambiguity with conflicting identifiers that may have been defined in +// at namespace already. +namespace { + +${dispatch_helpers} + +${dispatch_anonymous_definitions} + +${static_init_dispatch_registrations} + +} // anonymous namespace + +${deferred_dispatch_registrations} + +namespace ${dispatch_namespace} { + +${dispatch_namespaced_definitions} + +} // namespace ${dispatch_namespace} + +${ns_epilogue} diff --git a/torchgen/packaged/ATen/templates/RegisterDispatchKey.cpp b/torchgen/packaged/ATen/templates/RegisterDispatchKey.cpp new file mode 100644 index 00000000000..7a1584d505f --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterDispatchKey.cpp @@ -0,0 +1,54 @@ +// required for old g++ to compile PRId64 macros, see +// https://github.com/pytorch/pytorch/issues/3571 +// for context +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. +// clang-format off + +// NOTE: This condition is true for all PyTorch internal libraries, it +// just excludes external projects such as torch_xla which +// re-use some of the PyTorch codegen machinery. +#if defined(CAFFE2_BUILD_MAIN_LIB) || \ + defined(TORCH_CUDA_BUILD_MAIN_LIB) || \ + defined(TORCH_HIP_BUILD_MAIN_LIB) || \ + defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \ + defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB) +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#endif + +// ${generated_comment} + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +$extra_cuda_headers +$external_backend_headers +$dispatch_headers +$ops_headers + +// See template file RegisterDispatchDefinitions.ini +$dispatch_definitions diff --git a/torchgen/packaged/ATen/templates/RegisterFunctionalization.cpp b/torchgen/packaged/ATen/templates/RegisterFunctionalization.cpp new file mode 100644 index 00000000000..819e9e98e42 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterFunctionalization.cpp @@ -0,0 +1,108 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// ${generated_comment} + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +// needed for the meta tensor calls to get stride info in functionalization +#include +// needed for special handling of copy_(). 
+// See Note [functionalizating copy_() and not preserving strides] +#include +#include + +$ops_headers +#endif + +namespace at { +namespace functionalization { + +// This keyset is used by functionalization when it calls into meta kernels +// to accurately propagate stride metadata. +// Exclude any modes: the purpose of calling into meta kernels is only as an implementation +// detail to perform shape inference, and we don't want any modal keys to run. +// Specifically, we want to prevent functionalization and Python modes from running. +constexpr auto exclude_keys_for_meta_dispatch = + c10::functorch_transforms_ks | + c10::DispatchKeySet({ + c10::DispatchKey::FuncTorchDynamicLayerBackMode, + c10::DispatchKey::FuncTorchDynamicLayerFrontMode, + c10::DispatchKey::Python + }); + +// Helper around at::has_internal_overlap. +// The ATen util is used in hot-path eager mode: it's always fast, +// but might return TOO_HARD sometimes. +// During functionalization, we're ok taking a bit longer +// to detect memory overlap. +inline bool has_internal_overlap_helper(const at::Tensor t) { + auto has_overlap = at::has_internal_overlap(t); + if (has_overlap == at::MemOverlap::Yes) return true; + if (has_overlap == at::MemOverlap::No) return false; + return false; +} + + +inline Tensor to_meta(const Tensor& t) { + if (!t.defined()) return t; + return at::native::empty_strided_meta_symint(t.sym_sizes(), t.sym_strides(), +/*dtype=*/c10::make_optional(t.scalar_type()), /*layout=*/c10::make_optional(t.layout()), +/*device=*/c10::make_optional(c10::Device(kMeta)), /*pin_memory=*/c10::nullopt); +} + +inline c10::optional to_meta(const c10::optional& t) { + if (t.has_value()) { + return c10::make_optional(to_meta(*t)); + } + return c10::nullopt; +} + +inline std::vector to_meta(at::ITensorListRef t_list) { + std::vector outputs; + outputs.reserve(t_list.size()); + for (const auto& tensor : t_list) { + outputs.push_back(to_meta(tensor)); + } + return outputs; +} + +inline c10::List to_meta(const c10::List& t_list) { + c10::List outputs; + outputs.reserve(t_list.size()); + for (const auto i : c10::irange(t_list.size())) { + outputs.push_back(to_meta(t_list[i])); + } + return outputs; +} + +inline c10::List> to_meta(const c10::List>& t_list) { + c10::List> outputs; + outputs.reserve(t_list.size()); + for (const auto i : c10::irange(t_list.size())) { + outputs.push_back(to_meta(t_list[i])); + } + return outputs; +} + + +${func_definitions} + +} // namespace functionalization + +namespace { + +TORCH_LIBRARY_IMPL(aten, Functionalize, m) { + ${func_registrations}; +} + +} // namespace + +} // namespace at diff --git a/torchgen/packaged/ATen/templates/RegisterSchema.cpp b/torchgen/packaged/ATen/templates/RegisterSchema.cpp new file mode 100644 index 00000000000..029796d3e57 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegisterSchema.cpp @@ -0,0 +1,13 @@ +// ${generated_comment} +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +namespace at { +TORCH_LIBRARY(aten, m) { + ${aten_schema_registrations}; + // Distributed Ops + // Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp + m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); +} +${schema_registrations} +} // namespace at diff --git a/torchgen/packaged/ATen/templates/RegistrationDeclarations.h b/torchgen/packaged/ATen/templates/RegistrationDeclarations.h new file mode 100644 index 00000000000..5a0f0d0c7b4 --- /dev/null +++ b/torchgen/packaged/ATen/templates/RegistrationDeclarations.h @@ -0,0 +1,4 @@ +// This file 
contains all native_functions that can be registered to +// and the schema string that they should be registered with + +${registration_declarations} diff --git a/torchgen/packaged/ATen/templates/TensorBody.h b/torchgen/packaged/ATen/templates/TensorBody.h new file mode 100644 index 00000000000..084bec26aec --- /dev/null +++ b/torchgen/packaged/ATen/templates/TensorBody.h @@ -0,0 +1,751 @@ +#pragma once + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +namespace c10{ +template class List; +template class IListRef; +} +namespace at { +struct Generator; +struct Type; +class DeprecatedTypeProperties; +class Tensor; +} // namespace at +namespace at { +namespace indexing { +struct TensorIndex; +} // namespace indexing +} // namespace at + +namespace torch { namespace autograd { + +struct Node; + +}} // namespace torch::autograd + +namespace at { + +class OptionalTensorRef; +class Tensor; +using TensorList = ArrayRef; +using ITensorList = c10::IListRef; + +using Stream = c10::Stream; + +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +class TORCH_API Tensor: public TensorBase { + protected: + // Create a Tensor with a +0 reference count. Special care must be + // taken to avoid decrementing this reference count at destruction + // time. Intended to support MaybeOwnedTraits. + explicit Tensor(unsafe_borrow_t, const TensorBase& rhs): TensorBase(unsafe_borrow_t{}, rhs) {} + friend MaybeOwnedTraits; + friend OptionalTensorRef; + + public: + Tensor() = default; + // This constructor should not be used by end users and is an implementation + // detail invoked by autogenerated code. + explicit Tensor( + c10::intrusive_ptr tensor_impl) + : TensorBase(std::move(tensor_impl)) {} + Tensor(const Tensor &tensor) = default; + Tensor(Tensor &&tensor) = default; + + // Implicitly move-constructible from TensorBase, but must be explicit to increase refcount + explicit Tensor(const TensorBase &base): TensorBase(base) {} + /*implicit*/ Tensor(TensorBase &&base): TensorBase(std::move(base)) {} + + // Creates a new wrapper from TensorImpl. Intentionally a free method because + // it should be used with care. 
Checks necessary invariants + static Tensor wrap_tensor_impl( + c10::intrusive_ptr tensor_impl) { + return TensorBase::wrap_tensor_impl(std::move(tensor_impl)); + } + + Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const { + return TensorBase::contiguous(memory_format); + } + + Tensor conj() const { + if (!this->is_complex()) { + return *this; + } + + switch (this->layout()) { + case at::kSparse: + case at::kSparseCsr: + case at::kSparseCsc: + case at::kSparseBsr: + case at::kSparseBsc: + return this->conj_physical(); + default: + return this->_conj(); + } + } + + // Aliased by Dimname overloads, so need explicit using + using TensorBase::size; + using TensorBase::sym_size; + using TensorBase::stride; + + /// Should be used if *this can reasonably be expected to be contiguous and + /// performance is important. + /// Compared to contiguous, it saves a reference count + /// increment/decrement if *this is already contiguous, at the cost + /// in all cases of an extra pointer of stack usage, an extra branch + /// to access, and an extra branch at destruction time. + c10::MaybeOwned expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const &; + + // Use .contiguous() instead. Trying to borrow from a prvalue Tensor + // will only lead to trouble and dangling references. + c10::MaybeOwned expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete; + + // The following overloads are very intruiging. Consider the following + // program: + // + // x[1] = 3; + // + // We would expect that the first entry of x is written to 3. But how can we + // actually achieve this? x[1] evaluates to a tensor... + // + // The answer is, using a ref-qualifier. x[1] is an rvalue, which cannot be + // (profitably) assigned to in the traditional sense, so we overload + // assignment to mean, "Actually, copy 3 into the tensor data." This is done + // with an rvalue-reference ref-qualified overload (the methods with && at the + // end of their type.) + // + // There's one more fly in the ointment: We also want + // + // Tensor x = y; + // + // to work, and we want it NOT to copy. So we need a traditional operator= + // overload. But we MUST specify a mutable lvalue ref-qualifier, to + // disambiguate the traditional overload from the rvalue-reference + // ref-qualified overload. Otherwise, it will be ambiguous, because + // a non ref-qualified method is eligible for all situations. + + // Unfortunately, we have to write these constructors out manually + // to work around an MSVC bug: + // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &': + // multiple versions of a defaulted special member functions are not allowed + // Tensor& operator=(const Tensor&) & = default; + // Tensor& operator=(Tensor&&) & = default; + + // Also MSVC will wrongly issue the following warning with the aforementioned fix + // warning C4522: 'at::Tensor': multiple assignment operators specified + // Let's just skip the warning. 
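A short usage sketch of the two assignment flavours distinguished above (shapes and values are arbitrary):

    at::Tensor x = at::zeros({3});
    at::Tensor y = at::ones({3});
    at::Tensor z;

    z = y;     // lvalue (&) overload: z now points at y's TensorImpl, no element copy
    x[1] = 3;  // rvalue (&&) overload on the temporary x[1]: fill_() writes 3 into x
    x.slice(0, 0, 2) = y.slice(0, 0, 2);  // rvalue (&&) overload: copy_() into the viewed elements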
+ // + // TODO: temporarily disabled + + Tensor& operator=(const TensorBase& x) & { + impl_ = x.getIntrusivePtr(); + return *this; + } + Tensor& operator=(TensorBase&& x) & noexcept { + impl_ = x.unsafeReleaseIntrusivePtr(); + return *this; + } + + Tensor& operator=(const Tensor &x) & { + return operator=(static_cast(x)); + } + Tensor& operator=(Tensor &&x) & noexcept { + return operator=(static_cast(x)); + } + + Tensor& operator=(const Scalar &v) && { + return fill_(v); + } + Tensor& operator=(const Tensor &rhs) && { + return copy_(rhs); + } + Tensor& operator=(Tensor&& rhs) && { + return copy_(rhs); + } + + C10_DEPRECATED_MESSAGE("Tensor.type() is deprecated. Instead use Tensor.options(), which in many cases (e.g. in a constructor) is a drop-in replacement. If you were using data from type(), that is now available from Tensor itself, so instead of tensor.type().scalar_type(), use tensor.scalar_type() instead and instead of tensor.type().backend() use tensor.device().") + DeprecatedTypeProperties & type() const { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + dispatchKeyToBackend(legacyExtractDispatchKey(key_set())), + scalar_type()); + } + + Tensor toType(ScalarType t) const { + return to(options().dtype(t), /*non_blocking*/ false, /*copy*/ false); + } + + // TODO: Deprecate me + Tensor toBackend(Backend b) const { + return to(options().device(backendToDeviceType(b)).layout(layout_from_backend(b)), /*non_blocking*/ false, /*copy*/ false); + } + + C10_DEPRECATED_MESSAGE("Tensor.is_variable() is deprecated; everything is a variable now. (If you want to assert that variable has been appropriately handled already, use at::impl::variable_excluded_from_dispatch())") + bool is_variable() const noexcept { + return !at::impl::variable_excluded_from_dispatch(); + } + + template + C10_DEPRECATED_MESSAGE("Tensor.data() is deprecated. 
Please use Tensor.data_ptr() instead.") + T * data() const { + return data_ptr(); + } + + template + T item() const; + + template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> + C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead") + GenericPackedTensorAccessor packed_accessor() const & { + return generic_packed_accessor(); + } + template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> + C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead") + GenericPackedTensorAccessor packed_accessor() && = delete; + + Tensor operator~() const { + return bitwise_not(); + } + Tensor operator-() const { + return neg(); + } + Tensor& operator+=(const Tensor & other) { + return add_(other); + } + Tensor& operator+=(const Scalar & other) { + return add_(other); + } + Tensor& operator-=(const Tensor & other) { + return sub_(other); + } + Tensor& operator-=(const Scalar & other) { + return sub_(other); + } + Tensor& operator*=(const Tensor & other) { + return mul_(other); + } + Tensor& operator*=(const Scalar & other) { + return mul_(other); + } + Tensor& operator/=(const Tensor & other) { + return div_(other); + } + Tensor& operator/=(const Scalar & other) { + return div_(other); + } + Tensor& operator&=(const Tensor & other) { + return bitwise_and_(other); + } + Tensor& operator|=(const Tensor & other) { + return bitwise_or_(other); + } + Tensor& operator^=(const Tensor & other) { + return bitwise_xor_(other); + } + Tensor operator[](const Scalar & index) const { + if (!index.isIntegral(false)) { + TORCH_CHECK_INDEX(false, "Can only index tensors with integral scalars"); + } + return this->operator[](index.toLong()); + } + Tensor operator[](const Tensor & index) const { + // These properties are checked in the Scalar constructor, but we already + // check them here to provide more useful diagnostics for the user. + if (!index.defined()) { + TORCH_CHECK_INDEX(false, "Can only index with tensors that are defined"); + } + if (index.dim() != 0) { + TORCH_CHECK_INDEX(false, + "Can only index with tensors that are scalars (zero-dim)"); + } + // The Scalar(Tensor) constructor is explicit, so we need to call it. 
+ return this->operator[](index.item()); + } + Tensor operator[](int64_t index) const { + return select(0, index); + } + + Tensor index(ArrayRef indices) const; + Tensor index(std::initializer_list indices) const; + + Tensor & index_put_(ArrayRef indices, Tensor const & rhs); + Tensor & index_put_(ArrayRef indices, const Scalar& v); + Tensor & index_put_(std::initializer_list indices, Tensor const & rhs); + Tensor & index_put_(std::initializer_list indices, const Scalar& v); + + Tensor cpu() const { + return to(options().device(DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false); + } + + // TODO: The Python version also accepts arguments + Tensor cuda() const { + return to(options().device(DeviceType::CUDA), /*non_blocking*/ false, /*copy*/ false); + } + + Tensor hip() const { + return to(options().device(DeviceType::HIP), /*non_blocking*/ false, /*copy*/ false); + } + + Tensor ve() const { + return to(options().device(DeviceType::VE), /*non_blocking*/ false, /*copy*/ false); + } + + Tensor vulkan() const { + return to(options().device(DeviceType::Vulkan), /*non_blocking*/ false, /*copy*/ false); + } + + Tensor metal() const { + return to(options().device(DeviceType::Metal), /*non_blocking*/ false, /*copy*/ false); + } + + Tensor meta() const { + return to(options().device(DeviceType::Meta), /*non_blocking*/ false, /*copy*/ false); + } + + // ~~~~~ Autograd API ~~~~~ + + /// \fn bool is_leaf() const; + /// + /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention. + /// + /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were + /// created by the user. This means that they are not the result of an operation and so + /// `grad_fn()` is `nullptr`. + /// + /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`. + /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`. + /// + /// Example: + /// @code + /// auto a = torch::rand(10, torch::requires_grad()); + /// std::cout << a.is_leaf() << std::endl; // prints `true` + /// + /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA); + /// std::cout << b.is_leaf() << std::endl; // prints `false` + /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor + /// + /// auto c = torch::rand(10, torch::requires_grad()) + 2; + /// std::cout << c.is_leaf() << std::endl; // prints `false` + /// // c was created by the addition operation + /// + /// auto d = torch::rand(10).cuda(); + /// std::cout << d.is_leaf() << std::endl; // prints `true` + /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine) + /// + /// auto e = torch::rand(10).cuda().requires_grad_(); + /// std::cout << e.is_leaf() << std::endl; // prints `true` + /// // e requires gradients and has no operations creating it + /// + /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true)); + /// std::cout << f.is_leaf() << std::endl; // prints `true` + /// // f requires grad, has no operation creating it + /// @endcode + + /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; + /// + /// Computes the gradient of current tensor with respect to graph leaves. + /// + /// The graph is differentiated using the chain rule. If the tensor is + /// non-scalar (i.e. 
its data has more than one element) and requires + /// gradient, the function additionally requires specifying ``gradient``. + /// It should be a tensor of matching type and location, that contains + /// the gradient of the differentiated function w.r.t. this Tensor. + /// + /// This function accumulates gradients in the leaves - you might need to + /// zero them before calling it. + /// + /// \param gradient Gradient w.r.t. the + /// tensor. If it is a tensor, it will be automatically converted + /// to a Tensor that does not require grad unless ``create_graph`` is True. + /// None values can be specified for scalar Tensors or ones that + /// don't require grad. If a None value would be acceptable then + /// this argument is optional. + /// \param retain_graph If ``false``, the graph used to compute + /// the grads will be freed. Note that in nearly all cases setting + /// this option to True is not needed and often can be worked around + /// in a much more efficient way. Defaults to the value of + /// ``create_graph``. + /// \param create_graph If ``true``, graph of the derivative will + /// be constructed, allowing to compute higher order derivative + /// products. Defaults to ``false``. + /// \param inputs Inputs w.r.t. which the gradient will be accumulated into + /// ``at::Tensor::grad``. All other Tensors will be ignored. If not + /// provided, the gradient is accumulated into all the leaf Tensors + /// that were used to compute the current tensor. + /// When inputs are provided and a given input is not a leaf, + /// the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients). + /// It is an implementation detail on which the user should not rely. + /// See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details. + void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const { + // NB: Adding this wrapper to _backward here because we'd like our + // 'backwards' api to accept the 'inputs' argument optionally. Since code gen + // currently does not support optional of TensorList our approach is to replace + // backward in native_functions.yaml with _backward and call it here instead. + if (inputs.has_value()) { + TORCH_CHECK(inputs.value().size() > 0, "'inputs' argument to backward cannot be empty") + this->_backward(inputs.value(), gradient, retain_graph, create_graph); + } else { + this->_backward({}, gradient, retain_graph, create_graph); + } + } + + /// \fn Tensor detach() const; + /// + /// Returns a new Tensor, detached from the current graph. + /// The result will never require gradient. + + /// \fn Tensor & detach_() const; + /// + /// Detaches the Tensor from the graph that created it, making it a leaf. + /// Views cannot be detached in-place. + + /// \fn void retain_grad() const; + /// + /// Enables this Tensor to have their :attr:`grad` populated during + /// :func:`backward`. This is a no-op for leaf tensors. + + /// \fn bool retains_grad() const; + /// + /// Is ``true`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be + /// populated during :func:`backward`, ``false`` otherwise. + + const Tensor& set_requires_grad(bool requires_grad) const { + TensorBase::set_requires_grad(requires_grad); + return *this; + } + + /// Return a mutable reference to the gradient. This is conventionally + /// used as `t.grad() = x` to set a gradient to a completely new tensor. 
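/// A minimal sketch of that convention (shape and values chosen arbitrarily):
/// @code
/// auto t = torch::zeros({2}, torch::requires_grad());
/// t.mutable_grad() = torch::ones({2});  // replace .grad() with a brand-new tensor
/// @endcode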
+ /// Note that this function work with a non-const Tensor and is not + /// thread safe. + Tensor& mutable_grad() const { + return impl_->mutable_grad(); + } + + /// This function returns an undefined tensor by default and returns a defined tensor + /// the first time a call to `backward()` computes gradients for this Tensor. + /// The attribute will then contain the gradients computed and future calls + /// to `backward()` will accumulate (add) gradients into it. + const Tensor& grad() const { + const Tensor& maybe_grad = impl_->grad(); + if (!is_leaf() && !retains_grad() && !maybe_grad.defined()) { + TORCH_WARN( + "The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad " + "attribute won't be populated during autograd.backward(). If you indeed want the .grad " + "field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. " + "If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor " + "instead. See github.com/pytorch/pytorch/pull/30531 for more informations."); + } + return maybe_grad; + } + + // The Forward AD API functions below are low level and are not to be used by end + // users who should use the API provided in torch/csrc/autograd.h + + /// This function returns the forward gradient for this Tensor at the given level. + const Tensor& _fw_grad(uint64_t level) const { + return impl_->_fw_grad(level, *this); + } + + /// This function can be used to set the value of the forward grad. + /// Note that the given new_grad might not be used directly if it has different + /// metadata (size/stride/storage offset) compared to this Tensor. In that case, + /// new_grad content will be copied into a new Tensor + void _set_fw_grad(const TensorBase& new_grad, uint64_t level, bool is_inplace_op) const { + impl_->_set_fw_grad(new_grad, *this, level, is_inplace_op); + } + + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. + + //example + //Tensor * add(Tensor & b); + ${tensor_method_declarations} + + // Special C++ only overloads for std()-like functions (See gh-40287) + // These are needed because int -> bool conversion takes precedence over int -> IntArrayRef + // So, for example std(0) would select the std(unbiased=False) overload + + Tensor var(int dim) const { + return var(IntArrayRef{dim}); + } + + Tensor std(int dim) const { + return std(IntArrayRef{dim}); + } + + // We changed .dtype() to return a TypeMeta in #12766. Ideally, we want the + // at::kDouble and its friends to be TypeMeta's, but that hasn't happened yet. + // Before that change, we make this method to maintain BC for C++ usage like + // `x.to(y.dtype)`. + // TODO: remove following two after at::kDouble and its friends are TypeMeta's. + inline Tensor to(caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const { + return this->to(/*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy); + } + inline Tensor to(Device device, caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const { + return this->to(device, /*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy); + } + + template + decltype(auto) m(F func, Args&&... params) const { + return func(*this, std::forward(params)...); + } + + /// NOTE: This is similar to the legacy `.data()` function on `Variable`, and is intended + /// to be used from functions that need to access the `Variable`'s equivalent `Tensor` + /// (i.e. 
`Tensor` that shares the same storage and tensor metadata with the `Variable`). + /// + /// One notable difference with the legacy `.data()` function is that changes to the + /// returned `Tensor`'s tensor metadata (e.g. sizes / strides / storage / storage_offset) + /// will not update the original `Variable`, due to the fact that this function + /// shallow-copies the `Variable`'s underlying TensorImpl. + at::Tensor tensor_data() const { + return TensorBase::tensor_data(); + } + + /// NOTE: `var.variable_data()` in C++ has the same semantics as `tensor.data` + /// in Python, which create a new `Variable` that shares the same storage and + /// tensor metadata with the original `Variable`, but with a completely new + /// autograd history. + /// + /// NOTE: If we change the tensor metadata (e.g. sizes / strides / + /// storage / storage_offset) of a variable created from `var.variable_data()`, those + /// changes will not update the original variable `var`. In `.variable_data()`, we set + /// `allow_tensor_metadata_change_` to false to make such changes explicitly illegal, + /// in order to prevent users from changing metadata of `var.variable_data()` + /// and expecting the original variable `var` to also be updated. + at::Tensor variable_data() const { + return TensorBase::variable_data(); + } + + // Hooks + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + template + using hook_return_void_t = std::enable_if_t>::value, unsigned>; + template + using hook_return_var_t = std::enable_if_t, Tensor>::value, unsigned>; + + /// Registers a backward hook. + /// + /// The hook will be called every time a gradient with respect to the Tensor is computed. + /// The hook should have one of the following signature: + /// ``` + /// hook(Tensor grad) -> Tensor + /// ``` + /// ``` + /// hook(Tensor grad) -> void + /// ``` + /// The hook should not modify its argument, but it can optionally return a new gradient + /// which will be used in place of `grad`. + /// + /// This function returns the index of the hook in the list which can be used to remove hook. + /// + /// Example: + /// @code + /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad()); + /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient + /// v.backward(torch::tensor({1., 2., 3.})); + /// // This prints: + /// // ``` + /// // 2 + /// // 4 + /// // 6 + /// // [ CPUFloatType{3} ] + /// // ``` + /// std::cout << v.grad() << std::endl; + /// v.remove_hook(h); // removes the hook + /// @endcode + template + hook_return_void_t register_hook(T&& hook) const; + template + hook_return_var_t register_hook(T&& hook) const; + + // Variable methods + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Tensor data() const { + return TensorBase::data(); + } + + void _backward(TensorList inputs, const c10::optional& gradient, c10::optional keep_graph, bool create_graph) const; + + const Tensor& requires_grad_(bool _requires_grad=true) const { + TensorBase::requires_grad_(_requires_grad); + return *this; + } +}; + +namespace detail { +// Helper creator for Tensor class which doesn't requires the users to pass +// in an intrusive_ptr instead it just converts the argument passed to +// requested intrusive_ptr type. +template +Tensor make_tensor(Args&&... 
args) { + return Tensor(c10::make_intrusive(std::forward(args)...)); +} + +} // namespace detail + +} // namespace at + + +namespace at { +${tensor_method_definitions} +} // namespace at + + +namespace c10 { +template <> +struct MaybeOwnedTraits { + using owned_type = at::Tensor; + using borrow_type = at::Tensor; + + static borrow_type createBorrow(const owned_type& from) { + // NOTE: this can be implemented without the special + // unsafe_borrow_t Tensor constructor as + // + // return borrow_type(c10::intrusive_ptr::reclaim(from.unsafeGetTensorImpl())); + // + // but that hurts inlining due to the nullptr check in the + // Tensor(c10::intrusive_ptr<...>) constructor. We already know + // that from.impl_ isn't null because from is a valid Tensor, so + // we needn't do the check again. (using __builtin_assume can + // avoid this, but wouldn't be portable to MSVC.) + return borrow_type(borrow_type::unsafe_borrow_t{}, from); + } + + static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) { + lhs.unsafeReleaseTensorImpl(); + // See above note: this can be implemented with public API + // similarly to createBorrow(), but that would hurt inlining. + lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs); + } + + static void destroyBorrow(borrow_type& toDestroy) { + toDestroy.unsafeReleaseTensorImpl(); // "leak" it, but it was already +0. + } + + static const owned_type& referenceFromBorrow(const borrow_type& borrow) { + return borrow; + } + + static const owned_type* pointerFromBorrow(const borrow_type& borrow) { + return &borrow; + } + + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { + return true; + } +}; + +template <> +struct ExclusivelyOwnedTraits { + using repr_type = at::Tensor; + using pointer_type = at::Tensor*; + using const_pointer_type = const at::Tensor*; + + static repr_type nullRepr() { + return at::Tensor(); + } + + template + static repr_type createInPlace(Args&&... args) { + return at::Tensor(std::forward(args)...); + } + + static repr_type moveToRepr(at::Tensor&& x) { + return std::move(x); + } + + static void destroyOwned(at::Tensor& x) { + return ExclusivelyOwnedTraits::destroyOwned(x); + } + + static at::Tensor take(at::Tensor& x) { + return std::move(x); + } + + static pointer_type getImpl(repr_type& x) { + return &x; + } + + static const_pointer_type getImpl(const repr_type& x) { + return &x; + } +}; +} // namespace c10 + +namespace at { + +inline c10::MaybeOwned borrow_from_optional_tensor( + const c10::optional& opt) { + return opt.has_value() + ? c10::MaybeOwned::borrowed(*opt) + : c10::MaybeOwned::owned(c10::in_place); +} + +inline c10::MaybeOwned Tensor::expect_contiguous(MemoryFormat memory_format) const & { + if (is_contiguous(memory_format)) { + return c10::MaybeOwned::borrowed(*this); + } else { + return c10::MaybeOwned::owned(__dispatch_contiguous(memory_format)); + } +} +} // namespace at diff --git a/torchgen/packaged/ATen/templates/TensorMethods.cpp b/torchgen/packaged/ATen/templates/TensorMethods.cpp new file mode 100644 index 00000000000..68764fbbf58 --- /dev/null +++ b/torchgen/packaged/ATen/templates/TensorMethods.cpp @@ -0,0 +1,52 @@ +#include +#include + +#include + +namespace at { + +namespace { + +// Verifies the requested type is the same as the Tensor's type. 
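For a concrete sense of what the check below permits and rejects (a minimal sketch; the tensor and the requested types are chosen only for illustration):

    // at::Tensor t = at::zeros({2}, at::kFloat);
    // float*  p = t.data_ptr<float>();   // ok: requested type matches scalar_type()
    // double* q = t.data_ptr<double>();  // throws: "expected scalar type Double but found Float"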
+void check_type(const TensorBase& tensor, ScalarType type, c10::string_view type_name) { + TORCH_CHECK( + tensor.scalar_type() == type + || (isQIntType(tensor.scalar_type()) + && toUnderlying(tensor.scalar_type()) == type), + "expected scalar type ", type_name, " but found ", tensor.scalar_type()); +} + +} // namespace + +#define DEFINE_CAST(T, name) \ + template <> \ + TORCH_API const T* TensorBase::const_data_ptr() const { \ + check_type(*this, ScalarType::name, #name); \ + return this->unsafeGetTensorImpl()->data_ptr_impl(); \ + } \ + \ + template <> \ + TORCH_API T* TensorBase::mutable_data_ptr() const { \ + check_type(*this, ScalarType::name, #name); \ + return this->unsafeGetTensorImpl()->mutable_data_ptr_impl(); \ + } \ + \ + template <> \ + TORCH_API T* TensorBase::data_ptr() const { \ + return mutable_data_ptr(); \ + } \ + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CAST) + AT_FORALL_QINT_TYPES(DEFINE_CAST) + #undef DEFINE_CAST + + #define DEFINE_ITEM(T, name) \ + template <> \ + TORCH_API T Tensor::item() const { \ + return item().to##name(); \ + } + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ITEM) + #undef DEFINE_ITEM + + } //namespace at diff --git a/torchgen/packaged/ATen/templates/UfuncCPU.cpp b/torchgen/packaged/ATen/templates/UfuncCPU.cpp new file mode 100644 index 00000000000..6b363a50890 --- /dev/null +++ b/torchgen/packaged/ATen/templates/UfuncCPU.cpp @@ -0,0 +1,19 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/torchgen/packaged/ATen/templates/UfuncCPUKernel.cpp b/torchgen/packaged/ATen/templates/UfuncCPUKernel.cpp new file mode 100644 index 00000000000..0cac55664d6 --- /dev/null +++ b/torchgen/packaged/ATen/templates/UfuncCPUKernel.cpp @@ -0,0 +1,14 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +${native_definitions} +}} // namespace at::native diff --git a/torchgen/packaged/ATen/templates/UfuncCUDA.cu b/torchgen/packaged/ATen/templates/UfuncCUDA.cu new file mode 100644 index 00000000000..e75d82d9cc8 --- /dev/null +++ b/torchgen/packaged/ATen/templates/UfuncCUDA.cu @@ -0,0 +1,21 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +${cuda_headers} + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp b/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp new file mode 100644 index 00000000000..86c13235d86 --- /dev/null +++ b/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { +namespace unboxing { + +using ::c10::fmap; +using ::c10::filter; +using torch::jit::peek; +using torch::jit::drop; +using torch::jit::pack; +using torch::jit::pop; 
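Before the generated bodies are substituted in, a hand-written sketch of the shape they take for a hypothetical two-tensor op, using the peek/drop helpers imported above (my_op and the final kernel call are stand-ins, not generated code):

    // Hypothetical: unboxing adapter for a schema like my_op(Tensor self, Tensor other) -> Tensor
    at::Tensor my_op(Stack& stack) {
      // peek(stack, i, N) reads argument i of N without removing it; drop(stack, N)
      // then discards all N boxed arguments once they have been converted.
      auto self  = peek(stack, 0, 2).toTensor();
      auto other = peek(stack, 1, 2).toTensor();
      drop(stack, 2);
      return at::add(self, other);  // stand-in for the real unboxed kernel call
    }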
+ +// Generated function declaration +${definitions} + +} // namespace unboxing +} // namespace at diff --git a/torchgen/packaged/ATen/templates/UnboxingFunctions.h b/torchgen/packaged/ATen/templates/UnboxingFunctions.h new file mode 100644 index 00000000000..a65469a9b01 --- /dev/null +++ b/torchgen/packaged/ATen/templates/UnboxingFunctions.h @@ -0,0 +1,32 @@ +// ${generated_comment} + +// Generated by tools/jit/gen_unboxing.py. This file declares code generated boxed C++ functions for operators, +// base off of native_functions.yaml (or similar yaml file with the same syntax). The definition of such a boxed +// function will pop out IValues from the stack then convert them into the correct C++ types based on given schema. This +// unboxing logic is an alternative to template-based metaprogramming unboxing. + +#pragma once + +#include +namespace at { +namespace unboxing { +namespace { + +template +std::array as_array(const c10::List& list) { + std::array res; + AT_ASSERT(list.size() == N); + std::vector vec; + for (c10::IValue elem : list) { + vec.push_back(elem.to()); + } + std::copy(vec.begin(), vec.end(), res.begin()); + return res; +} +} // namespace +using Stack = std::vector; +// Generated function declaration +${declarations} + +} // namespace unboxing +} // namespace at diff --git a/torchgen/packaged/ATen/templates/aten_interned_strings.h b/torchgen/packaged/ATen/templates/aten_interned_strings.h new file mode 100644 index 00000000000..326d4622334 --- /dev/null +++ b/torchgen/packaged/ATen/templates/aten_interned_strings.h @@ -0,0 +1,22 @@ +#pragma once + +// ${generated_comment} + +#if defined(TORCH_ASSERT_NO_OPERATORS) || defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if including for \ + the c10::Symbol class would be sufficient, or if your change would be \ + better placed in another file. +#endif + +// ATen symbols correspond exactly to operators defined in ATen. Every +// symbol here corresponds exactly to an ATen operation defined in +// native_functions.yaml; attributes are in one-to-one correspondence +// with their ATen name. + +#define FORALL_ATEN_BASE_SYMBOLS(_) \ +${aten_symbols} + +#define FORALL_ATTR_BASE_SYMBOLS(_) \ +${attr_symbols} diff --git a/torchgen/packaged/ATen/templates/enum_tag.h b/torchgen/packaged/ATen/templates/enum_tag.h new file mode 100644 index 00000000000..1320fbc28ab --- /dev/null +++ b/torchgen/packaged/ATen/templates/enum_tag.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + // Enum of valid tags obtained from the entries in tags.yaml + enum class Tag { + ${enum_of_valid_tags} + }; +} diff --git a/torchgen/selective_build/__init__.py b/torchgen/selective_build/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/selective_build/operator.py b/torchgen/selective_build/operator.py new file mode 100644 index 00000000000..0cb92dfc09e --- /dev/null +++ b/torchgen/selective_build/operator.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +from dataclasses import dataclass + + +# This class holds information about a single operator used to determine +# the outcome of a selective/custom PyTorch build that doesn't include +# registration code for all the supported operators. This is done to +# reduce the size of the generated binary so that it can be deployed in +# situations where binary size comes at a premium. 
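+#
+# Purely illustrative sketch (hypothetical operator name and values; only the
+# keys mirror `allowed_keys` in `from_yaml_dict` below): a YAML entry for a
+# single operator might look like
+#
+#   aten::add.Tensor:
+#     is_root_operator: true
+#     is_used_for_training: false
+#     include_all_overloads: false
+#     debug_info:
+#       - "model_a"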
+# +@dataclass(frozen=True) +class SelectiveBuildOperator: + # The name of the operator. This includes the aten::, etc... prefix + # The operator name may or may not have the overload name. If this + # operator name does not specify an overload name, the way to determine + # if this entry refers to the family of operators with this base name + # or just the operator with this name is to look at the value of the + # 'include_all_overloads' flag in this class. + name: str + + # True if this is a root operator (i.e. called directly from a + # TorchScript model, etc...). An operator is considered to be a + # root operator if it is called directly from any one of the models + # that this instance of the pytorch library was built for. Hence, it + # may not be a root operator in all of the models that are used in + # this instance of the pytorch library. + is_root_operator: bool + + # Is this operator used for on-device training? If True, then we need to + # use the information to generate code in VariableType_N.cpp for registration + # of training related operators. Again, this is True if this operator + # is used for training in one or more models used by this instance of the + # pytorch library. + is_used_for_training: bool + + # If True, it indicates that this operator instance (object) refers to an + # operator without the overload name and should apply to all overloads + # which have this operator name as the base name. This flag is applicable + # only for objects that have operator names without a DOT (period) character + # in them. + # + # Note: This flag is a temporary workaround to grandfather in the current + # static selective (custom) build mechanism, which largely ignores overload + # names when determining whether to select operators for registration + # purposes. + include_all_overloads: bool + + # Debug Information at the operator level + _debug_info: tuple[str, ...] | None + + @staticmethod + def from_yaml_dict( + op_name: str, op_info: dict[str, object] + ) -> SelectiveBuildOperator: + allowed_keys = { + "name", + "is_root_operator", + "is_used_for_training", + "include_all_overloads", + "debug_info", + } + + if len(set(op_info.keys()) - allowed_keys) > 0: + raise Exception( # noqa: TRY002 + "Got unexpected top level keys: {}".format( + ",".join(set(op_info.keys()) - allowed_keys), + ) + ) + + if "name" in op_info: + assert op_name == op_info["name"] + + is_root_operator = op_info.get("is_root_operator", True) + assert isinstance(is_root_operator, bool) + + is_used_for_training = op_info.get("is_used_for_training", True) + assert isinstance(is_used_for_training, bool) + + include_all_overloads = op_info.get("include_all_overloads", True) + assert isinstance(include_all_overloads, bool) + + debug_info: tuple[str, ...] 
| None = None + if "debug_info" in op_info: + di_list = op_info["debug_info"] + assert isinstance(di_list, list) + debug_info = tuple(str(x) for x in di_list) + + return SelectiveBuildOperator( + name=op_name, + is_root_operator=is_root_operator, + is_used_for_training=is_used_for_training, + include_all_overloads=include_all_overloads, + _debug_info=debug_info, + ) + + @staticmethod + def from_legacy_operator_name_without_overload( + name: str, + ) -> SelectiveBuildOperator: + return SelectiveBuildOperator( + name=name, + is_root_operator=True, + is_used_for_training=True, + include_all_overloads=True, + _debug_info=None, + ) + + def to_dict(self) -> dict[str, object]: + ret: dict[str, object] = { + "is_root_operator": self.is_root_operator, + "is_used_for_training": self.is_used_for_training, + "include_all_overloads": self.include_all_overloads, + } + if self._debug_info is not None: + ret["debug_info"] = self._debug_info + + return ret + + +def merge_debug_info( + lhs: tuple[str, ...] | None, + rhs: tuple[str, ...] | None, +) -> tuple[str, ...] | None: + # Ensure that when merging, each entry shows up just once. + if lhs is None and rhs is None: + return None + + return tuple(set((lhs or ()) + (rhs or ()))) + + +def combine_operators( + lhs: SelectiveBuildOperator, rhs: SelectiveBuildOperator +) -> SelectiveBuildOperator: + if str(lhs.name) != str(rhs.name): + raise Exception( # noqa: TRY002 + f"Expected both arguments to have the same name, but got '{str(lhs.name)}' and '{str(rhs.name)}' instead" + ) + + return SelectiveBuildOperator( + name=lhs.name, + # Consider this operator to be a root operator if it is a + # root operator in any of the models used in this instance of + # the pytorch library. + is_root_operator=lhs.is_root_operator or rhs.is_root_operator, + # Consider this operator to be a training operator if it is + # an operator used for training in any of the models used + # in this instance of the pytorch library. + is_used_for_training=lhs.is_used_for_training or rhs.is_used_for_training, + include_all_overloads=lhs.include_all_overloads or rhs.include_all_overloads, + _debug_info=merge_debug_info(lhs._debug_info, rhs._debug_info), + ) + + +def merge_operator_dicts( + lhs: dict[str, SelectiveBuildOperator], + rhs: dict[str, SelectiveBuildOperator], +) -> dict[str, SelectiveBuildOperator]: + operators: dict[str, SelectiveBuildOperator] = {} + for op_name, op in list(lhs.items()) + list(rhs.items()): + new_op = op + if op_name in operators: + new_op = combine_operators(operators[op_name], op) + + operators[op_name] = new_op + + return operators + + +def strip_operator_overload_name(op_name: str) -> str: + return op_name.split(".")[0] diff --git a/torchgen/selective_build/selector.py b/torchgen/selective_build/selector.py new file mode 100644 index 00000000000..04acc354203 --- /dev/null +++ b/torchgen/selective_build/selector.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +from collections import defaultdict +from collections.abc import Iterable +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import yaml + +from torchgen.selective_build.operator import ( + merge_debug_info, + merge_operator_dicts, + SelectiveBuildOperator, + strip_operator_overload_name, +) + + +if TYPE_CHECKING: + from torchgen.model import NativeFunction + + +# A SelectiveBuilder holds information extracted from the selective build +# YAML specification. 
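+#
+# Purely illustrative sketch of such a specification (hypothetical values; the
+# top-level keys mirror `valid_top_level_keys` in `from_yaml_dict` below):
+#
+#   include_all_operators: false
+#   include_all_non_op_selectives: false
+#   operators:
+#     aten::mul.Tensor:
+#       is_root_operator: true
+#   kernel_metadata:
+#     add_kernel:
+#       - Float
+#       - Int
+#   custom_classes: []
+#   build_features: []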
+# +# It includes information about the build's selectivity, the debug_info +# associated with this selective build (opaque string), and the set of +# operators that should be included in the build. +# +@dataclass(frozen=True) +class SelectiveBuilder: + # If true, then the build is not selective, and includes all + # operators. + include_all_operators: bool + + # Debug Information at the selective/custom build level. + _debug_info: tuple[str, ...] | None + + # A dictionary of operator -> operator metadata. + operators: dict[str, SelectiveBuildOperator] + + # A dictionary of selected kernel tags and dtypes. Typically a + # PyTorch Operator Kernel (function) may have many code paths + # that are specialized for many many Tensor dtypes, so it's not + # one per kernel function, but there could be many per kernel + # function. The tag isn't a kernel function name, but some fragment + # of the kernel function implementation itself. + kernel_metadata: dict[str, list[str]] + + # ExecuTorch only. A dictionary of kernel tag -> list of (list of input + # dtypes for tensor-like input args). + # This is from selective.yaml + et_kernel_metadata: dict[str, list[str]] + + # A set of all the custom torch bind classes used by the selected models + # Stored as a set internally to remove duplicates proactively, but written + # as a list to yamls + custom_classes: set[str] + + # A set of all the build features used by the selected models + # Stored as a set internally to remove duplicates proactively, but written + # as a list to yamls + build_features: set[str] + + # If true, then fragments for all dtypes for all kernel functions + # are included as well as all custom classes. This is typically set when any one of the + # operator lists is generated from a mechanism other than + # tracing based selective build. 
+ include_all_non_op_selectives: bool + + @staticmethod + def get_nop_selector() -> SelectiveBuilder: + return SelectiveBuilder.from_yaml_dict({"include_all_operators": True}) + + @staticmethod + def from_yaml_dict(data: dict[str, object]) -> SelectiveBuilder: + valid_top_level_keys = { + "include_all_non_op_selectives", + "include_all_operators", + "debug_info", + "operators", + "kernel_metadata", + "et_kernel_metadata", + "custom_classes", + "build_features", + } + top_level_keys = set(data.keys()) + if len(top_level_keys - valid_top_level_keys) > 0: + raise Exception( # noqa: TRY002 + "Got unexpected top level keys: {}".format( + ",".join(top_level_keys - valid_top_level_keys), + ) + ) + include_all_operators = data.get("include_all_operators", False) + assert isinstance(include_all_operators, bool) + + debug_info = None + if "debug_info" in data: + di_list = data["debug_info"] + assert isinstance(di_list, list) + + debug_info = tuple(str(x) for x in di_list) + + operators = {} + operators_dict = data.get("operators", {}) + assert isinstance(operators_dict, dict) + + for k, v in operators_dict.items(): + operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) + + kernel_metadata = {} + kernel_metadata_dict = data.get("kernel_metadata", {}) + assert isinstance(kernel_metadata_dict, dict) + + for k, v in kernel_metadata_dict.items(): + kernel_metadata[str(k)] = [str(dtype) for dtype in v] + + et_kernel_metadata = data.get("et_kernel_metadata", {}) + assert isinstance(et_kernel_metadata, dict) + + custom_classes = data.get("custom_classes", []) + assert isinstance(custom_classes, Iterable) + custom_classes = set(custom_classes) + + build_features = data.get("build_features", []) + assert isinstance(build_features, Iterable) + build_features = set(build_features) + + include_all_non_op_selectives = data.get("include_all_non_op_selectives", False) + assert isinstance(include_all_non_op_selectives, bool) + + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + et_kernel_metadata, + custom_classes, # type: ignore[arg-type] + build_features, # type: ignore[arg-type] + include_all_non_op_selectives, + ) + + @staticmethod + def from_yaml_str(config_contents: str) -> SelectiveBuilder: + contents = yaml.safe_load(config_contents) + return SelectiveBuilder.from_yaml_dict(contents) + + @staticmethod + def from_yaml_path(config_path: str) -> SelectiveBuilder: + with open(config_path) as f: + contents = yaml.safe_load(f) + return SelectiveBuilder.from_yaml_dict(contents) + + @staticmethod + def from_legacy_op_registration_allow_list( + allow_list: set[str], is_root_operator: bool, is_used_for_training: bool + ) -> SelectiveBuilder: + operators = {} + for op in allow_list: + operators[op] = { + "name": op, + "is_root_operator": is_root_operator, + "is_used_for_training": is_used_for_training, + "include_all_overloads": True, + } + return SelectiveBuilder.from_yaml_dict( + { + "operators": operators, + "include_all_non_op_selectives": True, + } + ) + + def is_operator_selected(self, name: str) -> bool: + if self.include_all_operators: + return True + + if name in self.operators: + return True + name = strip_operator_overload_name(name) + return name in self.operators and self.operators[name].include_all_overloads + + def is_native_function_selected(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected(op_name) + + def is_operator_selected_for_training(self, name: str) -> bool: + if not 
self.is_operator_selected(name): + return False + if self.include_all_operators: + return True + + not_training_op = SelectiveBuildOperator( + name="", + is_root_operator=False, + is_used_for_training=False, + include_all_overloads=False, + _debug_info=None, + ) + op = not_training_op + if name in self.operators: + op = self.operators[name] + + name = strip_operator_overload_name(name) + base_op = not_training_op + if name in self.operators: + base_op = self.operators[name] + + return op.is_used_for_training or ( + base_op.include_all_overloads and base_op.is_used_for_training + ) + + def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected_for_training(op_name) + + def is_root_operator(self, name: str) -> bool: + if not self.is_operator_selected(name): + return False + if self.include_all_operators: + return True + + if name in self.operators: + op: SelectiveBuildOperator = self.operators[name] + return op.is_root_operator + name = strip_operator_overload_name(name) + if name not in self.operators: + return False + base_op: SelectiveBuildOperator = self.operators[name] + return base_op.include_all_overloads and base_op.is_root_operator + + def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: + if self.include_all_operators or self.include_all_non_op_selectives: + return True + + return ( + kernel_tag in self.kernel_metadata + and dtype in self.kernel_metadata[kernel_tag] + ) + + def et_get_selected_kernels(self, op_name: str, kernel_key: list[str]) -> list[str]: + """ + Return a list of kernel keys that cover the used ops + """ + # If no kernel metadata, either it's implied by include_all_operators=True or the op is not used. + if op_name not in self.et_kernel_metadata: + return kernel_key if self.include_all_operators else [] + # Otherwise, only return the specific kernel keys. 
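+        # Illustrative (hypothetical) key shape, inferred from the split("/")
+        # below: a key such as "v1/6;0,1" would be read as version "v1" plus a
+        # dtype/dim-order specialization "6;0,1"; only the part after the "/"
+        # is matched against the model's kernel keys, and "default" is special.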
+ + result_set = set() + + for model_kernel_keys in self.et_kernel_metadata[op_name]: + key_found = False + for key in kernel_key: + # Don't compare the version for now + if ( + key != "default" + and key.split("/")[1] == model_kernel_keys.split("/")[1] + ): + result_set.add(key) + key_found = True + break + if not key_found: + if "default" not in kernel_key: + raise Exception("Missing kernel for the model") # noqa: TRY002 + else: + result_set.add("default") + + return list(result_set) + + def to_dict(self) -> dict[str, object]: + ret: dict[str, object] = { + "include_all_non_op_selectives": self.include_all_non_op_selectives, + "include_all_operators": self.include_all_operators, + } + operators = {} + for op_name, op in self.operators.items(): + operators[op_name] = op.to_dict() + ret["operators"] = operators + + if self._debug_info is not None: + ret["debug_info"] = sorted(self._debug_info) + + ret["kernel_metadata"] = { + k: sorted(v) for (k, v) in self.kernel_metadata.items() + } + + ret["et_kernel_metadata"] = self.et_kernel_metadata + + ret["custom_classes"] = sorted(self.custom_classes) + + ret["build_features"] = sorted(self.build_features) + + return ret + + +def merge_kernel_metadata( + lhs: dict[str, list[str]], + rhs: dict[str, list[str]], +) -> dict[str, list[str]]: + kernel_metadata: dict[str, list[str]] = {} + for tag_name, dtypes in list(lhs.items()) + list(rhs.items()): + dtypes_copy = set(dtypes) + if tag_name in kernel_metadata: + dtypes_copy |= set(kernel_metadata[tag_name]) + + kernel_metadata[tag_name] = list(dtypes_copy) + + return kernel_metadata + + +def merge_et_kernel_metadata( + lhs: dict[str, list[str]], + rhs: dict[str, list[str]], +) -> dict[str, list[str]]: + merge_et_kernel_metadata: dict[str, set[str]] = defaultdict(set) + for op in list(lhs.keys()) + list(rhs.keys()): + merge_et_kernel_metadata[op].update(lhs.get(op, [])) + merge_et_kernel_metadata[op].update(rhs.get(op, [])) + + return {op: sorted(val) for op, val in merge_et_kernel_metadata.items()} + + +def combine_selective_builders( + lhs: SelectiveBuilder, rhs: SelectiveBuilder +) -> SelectiveBuilder: + include_all_operators = lhs.include_all_operators or rhs.include_all_operators + debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) + operators = merge_operator_dicts(lhs.operators, rhs.operators) + kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) + et_kernel_metadata = merge_et_kernel_metadata( + lhs.et_kernel_metadata, rhs.et_kernel_metadata + ) + include_all_non_op_selectives = ( + lhs.include_all_non_op_selectives or rhs.include_all_non_op_selectives + ) + custom_classes = lhs.custom_classes.union(rhs.custom_classes) + build_features = lhs.build_features.union(rhs.build_features) + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + et_kernel_metadata, + custom_classes, + build_features, + include_all_non_op_selectives, + ) + + +def op_name_from_native_function(f: NativeFunction) -> str: + # This was originally read from the 'operator_name_with_overload' field in the + # declaration dict, which was the part before the first '(' in 'schema_string'. 
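+    # For example (illustrative): a native function in the "aten" namespace whose
+    # schema name is "add.Tensor" yields "aten::add.Tensor" here.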
+ return f"{f.namespace}::{f.func.name}" diff --git a/torchgen/shape_functions/gen_jit_shape_functions.py b/torchgen/shape_functions/gen_jit_shape_functions.py new file mode 100644 index 00000000000..56a3d8bf0dd --- /dev/null +++ b/torchgen/shape_functions/gen_jit_shape_functions.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +import os +import sys +from importlib.util import module_from_spec, spec_from_file_location +from itertools import chain +from pathlib import Path + + +# Manually importing the shape function module based on current directory +# instead of torch imports to avoid needing to recompile Pytorch before +# running the script + +file_path = Path.cwd() / "torch" / "jit" / "_shape_functions.py" +module_name = "torch.jit._shape_functions" + +err_msg = """Could not find shape functions file, please make sure +you are in the root directory of the Pytorch git repo""" +if not file_path.exists(): + raise Exception(err_msg) # noqa: TRY002 + +spec = spec_from_file_location(module_name, file_path) +assert spec is not None +module = module_from_spec(spec) +sys.modules[module_name] = module +assert spec.loader is not None +assert module is not None +spec.loader.exec_module(module) + +bounded_compute_graph_mapping = module.bounded_compute_graph_mapping +shape_compute_graph_mapping = module.shape_compute_graph_mapping + + +SHAPE_HEADER = r""" +/** + * @generated + * This is an auto-generated file. Please do not modify it by hand. + * To re-generate, please run: + * cd ~/pytorch && python + * torchgen/shape_functions/gen_jit_shape_functions.py + */ +#include +#include +#include +#include + +// clang-format off + +namespace torch { +namespace jit { + + +std::string shape_funcs = "" +""" + + +DECOMP_CENTER = r""" + + +const std::string& GetSerializedShapeFunctions() { + return shape_funcs; +} + +""" + +DECOMP_END = r""" +// clang-format on + +} // namespace jit +} // namespace torch +""" + + +SERIALIZED_SHAPE_UTIL_FILE_NAME = "serialized_shape_function_registry.cpp" + + +def gen_serialized_decompisitions() -> str: + already_serialized_names = set() + unique_funcs = [] + all_funcs = chain( + shape_compute_graph_mapping.values(), *bounded_compute_graph_mapping.values() + ) + for scripted_func in all_funcs: + if scripted_func.name in already_serialized_names: + continue + already_serialized_names.add(scripted_func.name) + unique_funcs.append(scripted_func) + + output_strs = [] + curr_str = "" + for scripted_func in unique_funcs: + serialized_code = scripted_func.code + # technically its higher but give a buffer bc there are weird rules + # around some characters + # TODO: this was the limit I found by googling but it seems way + # too short ? 
+ MAX_MSFT_STR_LEN = 2000 + if len(curr_str) + len(serialized_code) <= MAX_MSFT_STR_LEN: + curr_str += "\n" + serialized_code + else: + output_strs.append(curr_str) + curr_str = scripted_func.code + output_strs.append(curr_str) + + final_output = "" + # Windows compiler doesnt correctly handle adjacent + # string literals + for output_str in output_strs: + start = '+ std::string(R"=====(' + end = '\n)=====")\n' + final_output += start + output_str + end + final_output += ";" + return final_output + + +SHAPE_SCHEMA_START = r""" +const OperatorMap& GetShapeFunctionMappings() { + static const OperatorMap shape_mappings { +""" + +SHAPE_SCHEMA_END = r""" + }; + + return shape_mappings; +} +""" + + +def gen_shape_mappings() -> str: + shape_mappings = [] + for schema, scripted_func in shape_compute_graph_mapping.items(): + shape_mappings.append(' {"' + schema + '", "' + scripted_func.name + '"},') + return SHAPE_SCHEMA_START + "\n".join(shape_mappings) + SHAPE_SCHEMA_END + + +BOUNDED_SCHEMA_START = r""" +const OperatorMap>& GetBoundedShapeMappings() { + static const OperatorMap> shape_mappings { +""" + + +def gen_bounded_mappings() -> str: + bounded_mappings = [] + for schema, (lower_func, upper_func) in bounded_compute_graph_mapping.items(): + map_str = ( + ' {"' + + schema + + '", {"' + + lower_func.name + + '", "' + + upper_func.name + + '"}},' + ) + bounded_mappings.append(map_str) + return BOUNDED_SCHEMA_START + "\n".join(bounded_mappings) + SHAPE_SCHEMA_END + + +def write_decomposition_util_file(path: str) -> None: + decomposition_str = gen_serialized_decompisitions() + shape_mappings = gen_shape_mappings() + bounded_mappings = gen_bounded_mappings() + file_components = [ + SHAPE_HEADER, + decomposition_str, + DECOMP_CENTER, + shape_mappings, + bounded_mappings, + DECOMP_END, + ] + print("writing file to : ", path + "/" + SERIALIZED_SHAPE_UTIL_FILE_NAME) + with open(os.path.join(path, SERIALIZED_SHAPE_UTIL_FILE_NAME), "wb") as out_file: + final_output = "".join(file_components) + out_file.write(final_output.encode("utf-8")) + + +def main() -> None: + pytorch_dir = Path(__file__).resolve().parents[2] + upgrader_path = pytorch_dir / "torch" / "csrc" / "jit" / "runtime" + write_decomposition_util_file(str(upgrader_path)) + + +if __name__ == "__main__": + main() diff --git a/torchgen/static_runtime/__init__.py b/torchgen/static_runtime/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/torchgen/static_runtime/config.py b/torchgen/static_runtime/config.py new file mode 100644 index 00000000000..1e7b541fa2c --- /dev/null +++ b/torchgen/static_runtime/config.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +from torchgen.model import NativeFunctionsGroup, NativeFunctionsViewGroup + + +def func_name_base_str(g: NativeFunctionsGroup | NativeFunctionsViewGroup) -> str: + if isinstance(g, NativeFunctionsGroup): + return str(g.functional.func.name.name.base) + else: + return str(g.view.root_name) + + +is_hand_written_ops_ = frozenset( + ( + "abs", + "add", + "addmm", + "all", + "any", + "argmin", + "bmm", + "clamp", + "clamp_min", + "cumsum", + "div", + "fmod", + "index_select", + "leaky_relu", + "linear", + "log", + "matmul", + "mul", + "narrow_copy", + "nonzero", + "pow", + "remainder", + "sigmoid", + "sign", + "sub", + "tanh", + "detach", + "expand_as", + "flatten", + "narrow", + "reshape_as", + "select", + "slice", + "softmax", + "split", + "squeeze", + "transpose", + "view", + "where", + ) +) + + +def is_hand_written(g: NativeFunctionsGroup | 
NativeFunctionsViewGroup) -> bool: + name_base = func_name_base_str(g) + return name_base in is_hand_written_ops_ + + +def override_test_values(arg_map: dict[str, str], op_name: str, index: int) -> None: + assert index == 0 or index == 1 + if op_name == "addr": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + arg_map["vec1"] = "at::rand({6})" + arg_map["vec2"] = "at::rand({6})" + else: + arg_map["self"] = "at::rand({22, 22})" + arg_map["vec1"] = "at::rand({22})" + arg_map["vec2"] = "at::rand({22})" + return + if op_name == "mv": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + arg_map["vec"] = "at::rand({6})" + else: + arg_map["self"] = "at::rand({22, 22})" + arg_map["vec"] = "at::rand({22})" + return + if op_name == "addbmm": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + else: + arg_map["self"] = "at::rand({22, 22})" + return + if op_name == "cross": + if index == 0: + arg_map["self"] = "at::rand({3, 3, 3})" + arg_map["other"] = "at::rand({3, 3, 3})" + else: + arg_map["self"] = "at::rand({22, 3, 22})" + arg_map["other"] = "at::rand({22, 3, 22})" + return + if op_name == "take": + if index == 0: + arg_map["index"] = "at::randint(0, 216, {20}, torch::kInt64)" + else: + arg_map["index"] = "at::randint(0, 1000, {100}, torch::kInt64)" + return + if op_name == "take_along_dim": + if index == 0: + arg_map["indices"] = "at::argsort(self0, 1, true)" + else: + arg_map["indices"] = "at::argsort(self1, 1, true)" + return + if op_name == "masked_select": + if index == 0: + arg_map["mask"] = "at::randn({6, 6, 6}) > 0.5" + else: + arg_map["mask"] = "at::rand({22, 22, 22}) > 0.5" + return + if op_name == "orgqr": + if index == 0: + arg_map["input2"] = "at::rand({6, 6})" + else: + arg_map["input2"] = "at::rand({22, 22})" + return + if op_name == "ormqr": + if index == 0: + arg_map["input2"] = "at::rand({6, 6})" + else: + arg_map["input2"] = "at::rand({22, 22})" + return + if op_name == "quantile": + if index == 0: + arg_map["q"] = "at::rand({6})" + arg_map["interpolation"] = '"linear"' + else: + arg_map["q"] = "at::rand({22})" + arg_map["interpolation"] = '"linear"' + return + if op_name == "nanquantile": + if index == 0: + arg_map["q"] = "at::rand({6})" + arg_map["interpolation"] = '"linear"' + else: + arg_map["q"] = "at::rand({22})" + arg_map["interpolation"] = '"linear"' + return + if op_name == "multi_margin_loss": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + arg_map["target"] = "at::randint(6, {6}, torch::kInt64)" + arg_map["weight"] = "at::rand({6})" + else: + arg_map["self"] = "at::rand({22, 22})" + arg_map["target"] = "at::randint(22, {22}, torch::kInt64)" + arg_map["weight"] = "at::rand({22})" + return + if op_name == "multilabel_margin_loss": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + arg_map["target"] = "at::randint(6, {6, 6}, torch::kInt64)" + else: + arg_map["self"] = "at::rand({22, 22})" + arg_map["target"] = "at::randint(22, {22, 22}, torch::kInt64)" + return + if op_name == "nll_loss": + if index == 0: + arg_map["self"] = "at::rand({6, 6})" + arg_map["target"] = "at::randint(6, {6}, torch::kInt64)" + arg_map["weight"] = "at::rand({6})" + else: + arg_map["self"] = "at::rand({22, 22})" + arg_map["target"] = "at::randint(22, {22}, torch::kInt64)" + arg_map["weight"] = "at::rand({22})" + return + if op_name == "nll_loss2d": + if index == 0: + arg_map["self"] = "at::rand({6, 6, 6, 6})" + arg_map["target"] = "at::randint(6, {6, 6, 6}, torch::kInt64)" + arg_map["weight"] = "at::rand({6})" + else: + arg_map["self"] = "at::rand({22, 22, 22, 
22})" + arg_map["target"] = "at::randint(22, {22, 22, 22}, torch::kInt64)" + arg_map["weight"] = "at::rand({22})" + return + if op_name in ( + "fft_fft", + "fft_ifft", + "fft_rfft", + "fft_irfft", + "fft_hfft", + "fft_ihfft", + ): + arg_map["norm"] = '"forward"' + return + if op_name == "linalg_tensorinv": + if index == 0: + arg_map["self"] = "at::rand({6, 6, 6, 6})" + arg_map["ind"] = "2" + else: + arg_map["self"] = "at::rand({22, 22, 22, 22})" + arg_map["ind"] = "2" + return + if op_name == "addmv": + if index == 0: + arg_map["self"] = "at::rand({2})" + arg_map["mat"] = "at::rand({2, 2})" + arg_map["vec"] = "at::rand({2})" + else: + arg_map["self"] = "at::rand({35})" + arg_map["mat"] = "at::rand({35, 35})" + arg_map["vec"] = "at::rand({35})" + return + if op_name == "acosh": + if index == 0: + arg_map["self"] = "at::rand({2, 2, 2}) + at::ones({2, 2, 2})" + else: + arg_map["self"] = "at::rand({5, 5, 5}) + at::ones({5, 5, 5})" + return + if op_name == "adaptive_max_pool2d_backward": + if index == 0: + arg_map["grad_output"] = "at::rand({2, 2, 2}, at::kFloat)" + arg_map["self"] = "at::rand({2, 2, 2}, at::kFloat)" + arg_map["indices"] = "at::randint(0, 1, {2, 2, 2}, at::kLong)" + else: + arg_map["grad_output"] = "at::rand({3, 3, 3}, at::kFloat)" + arg_map["self"] = "at::rand({3, 3, 3}, at::kFloat)" + arg_map["indices"] = "at::randint(0, 1, {3, 3, 3}, at::kLong)" + return + if op_name == "adaptive_max_pool3d_backward": + if index == 0: + arg_map["grad_output"] = "at::rand({2, 2, 2, 2}, at::kFloat)" + arg_map["self"] = "at::rand({2, 2, 2, 2}, at::kFloat)" + arg_map["indices"] = "at::randint(0, 1, {2, 2, 2, 2}, at::kLong)" + else: + arg_map["grad_output"] = "at::rand({3, 3, 3, 3}, at::kFloat)" + arg_map["self"] = "at::rand({3, 3, 3, 3}, at::kFloat)" + arg_map["indices"] = "at::randint(0, 1, {3, 3, 3, 3}, at::kLong)" + return + if op_name == "bitwise_left_shift": + if index == 0: + arg_map["self"] = "at::randint(1, 1 << 4, {6, 6, 6}, at::kInt)" + arg_map["other"] = "at::randint(1, 26, {6, 6, 6}, at::kInt)" + else: + arg_map["self"] = "at::randint(1, 1 << 4, {22, 22, 22}, at::kInt)" + arg_map["other"] = "at::randint(1, 26, {22, 22, 22}, at::kInt)" + return + if op_name == "bitwise_right_shift": + if index == 0: + arg_map["self"] = "at::randint(1 << 21, 1 << 30, {6, 6, 6}, at::kInt)" + arg_map["other"] = "at::randint(1, 22, {6, 6, 6}, at::kInt)" + else: + arg_map["self"] = "at::randint(1 << 21, 1 << 30, {22, 22, 22}, at::kInt)" + arg_map["other"] = "at::randint(1, 22, {22, 22, 22}, at::kInt)" + return + if op_name == "gather": + if index == 0: + arg_map["self"] = "at::randint(1, 100, {2,2,2}, at::kInt)" + arg_map["dim"] = "1" + arg_map["index"] = "at::randint(0, 1, {2,2,2}, torch::kInt64)" + arg_map["sparse_grad"] = "false" + else: + arg_map["self"] = "at::randint(1, 100, {5,5,5}, at::kInt)" + arg_map["dim"] = "1" + arg_map["index"] = "at::randint(0, 4, {5,5,5}, torch::kInt64)" + arg_map["sparse_grad"] = "false" + return + if op_name == "gelu": + if index == 0: + arg_map["self"] = "at::rand({6, 6, 6})" + arg_map["approximate"] = '"tanh"' + else: + arg_map["self"] = "at::rand({22, 22, 22})" + arg_map["approximate"] = '"tanh"' + return + if op_name == "gelu_backward": + if index == 0: + arg_map["grad_output"] = "at::rand({6, 6, 6})" + arg_map["self"] = "at::rand({6, 6, 6})" + arg_map["approximate"] = '"tanh"' + else: + arg_map["grad_output"] = "at::rand({22, 22, 22})" + arg_map["self"] = "at::rand({22, 22, 22})" + arg_map["approximate"] = '"tanh"' + return + if op_name == "index_add": + if index == 
0: + arg_map["self"] = "at::rand({2})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 1, {2}, at::kInt)" + arg_map["source"] = "at::rand({2})" + arg_map["alpha"] = "2" + else: + arg_map["self"] = "at::rand({16})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 10, {16}, at::kInt)" + arg_map["source"] = "at::rand({16})" + arg_map["alpha"] = "2" + return + if op_name == "index_copy": + if index == 0: + arg_map["self"] = "at::rand({2})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 1, {2}, at::kLong)" + arg_map["source"] = "at::rand({2})" + else: + arg_map["self"] = "at::rand({32})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 10, {32}, at::kLong)" + arg_map["source"] = "at::rand({32})" + return + if op_name == "linalg_cross": + if index == 0: + arg_map["self"] = "at::rand({6, 3, 6})" + arg_map["other"] = "at::rand({6, 3, 6})" + arg_map["dim"] = "1" + else: + arg_map["self"] = "at::rand({22, 3, 22})" + arg_map["other"] = "at::rand({22, 3, 22})" + arg_map["dim"] = "1" + return + if op_name == "nll_loss_backward": + if index == 0: + arg_map["grad_output"] = "at::rand({})" + arg_map["self"] = "at::rand({6})" + arg_map["target"] = "at::randint(0, 5, {6}, torch::kInt64)" + arg_map["weight"] = "at::rand({6})" + arg_map["reduction"] = "1" + arg_map["ignore_index"] = "1" + arg_map["total_weight"] = "at::rand({})" + else: + arg_map["grad_output"] = "at::rand({})" + arg_map["self"] = "at::rand({36})" + arg_map["target"] = "at::randint(0, 11, {36}, torch::kInt64)" + arg_map["weight"] = "at::rand({36})" + arg_map["reduction"] = "1" + arg_map["ignore_index"] = "1" + arg_map["total_weight"] = "at::rand({})" + return + if op_name in ["scatter", "scatter_add", "_scatter_reduce"]: + if index == 0: + arg_map["self"] = "at::randint(1, 100, {2,2,2}, torch::kInt64)" + arg_map["index"] = "at::randint(0, 1, {2,2,2}, torch::kInt64)" + arg_map["src"] = "at::randint(1, 100, {2,2,2}, torch::kInt64)" + else: + arg_map["self"] = "at::randint(1, 100, {5,5,5}, torch::kInt64)" + arg_map["index"] = "at::randint(0, 1, {5,5,5}, torch::kInt64)" + arg_map["src"] = "at::randint(1, 100, {5,5,5}, torch::kInt64)" + if "reduce" in arg_map: + arg_map["reduce"] = '"sum"' if op_name == "_scatter_reduce" else '"add"' + return + if op_name == "scatter_reduce": + arg_map["reduce"] = '"mean"' + if index == 0: + arg_map["index"] = "at::randint(6, {6, 6, 6}, torch::kInt64)" + else: + arg_map["index"] = "at::randint(22, {22, 22, 22}, torch::kInt64)" + return + if op_name == "special_zeta": + if index == 0: + arg_map["self"] = "at::rand({2,2,2}, at::kDouble) + at::ones({2,2,2})" + arg_map["other"] = "at::rand({2,2,2}, at::kDouble) + at::ones({2,2,2})" + else: + arg_map["self"] = "at::rand({5,5,5}, at::kDouble) + at::ones({5,5,5})" + arg_map["other"] = "at::rand({5,5,5}, at::kDouble) + at::ones({5,5,5})" + return + if op_name == "_convert_indices_from_csr_to_coo": + if index == 0: + arg_map["crow_indices"] = "torch::tensor({1}, torch::kInt32)" + arg_map["col_indices"] = "torch::tensor({0, 1, 0}, torch::kInt32)" + arg_map["out_int32"] = "false" + else: + arg_map["crow_indices"] = "torch::tensor({0}, torch::kInt32)" + arg_map[ + "col_indices" + ] = "torch::tensor({0, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1, 2}, torch::kInt32)" + arg_map["out_int32"] = "false" + return + if op_name == "_convert_indices_from_coo_to_csr": + if index == 0: + arg_map["self"] = "at::randint(0, 3, {2}, at::kInt)" + arg_map["size"] = "10" + arg_map["out_int32"] = "false" + else: + arg_map["self"] = "at::randint(0, 3, {12}, 
at::kInt)" + arg_map["size"] = "24" + arg_map["out_int32"] = "false" + return + if op_name in ("diagonal", "linalg_diagonal"): + arg_map["offset"] = "0" + arg_map["dim1"] = "2" + arg_map["dim2"] = "1" + return diff --git a/torchgen/static_runtime/gen_static_runtime_ops.py b/torchgen/static_runtime/gen_static_runtime_ops.py new file mode 100644 index 00000000000..9f735717374 --- /dev/null +++ b/torchgen/static_runtime/gen_static_runtime_ops.py @@ -0,0 +1,229 @@ +from __future__ import annotations + +import argparse +import itertools +import os +from typing import Sequence, TypeVar, Union + +from libfb.py.log import set_simple_logging # type: ignore[import] + +from torchgen import gen +from torchgen.context import native_function_manager +from torchgen.model import DispatchKey, NativeFunctionsGroup, NativeFunctionsViewGroup +from torchgen.static_runtime import config, generator + + +# Given a list of `grouped_native_functions` sorted by their op names, return a list of +# lists each of which groups ops that share the base name. For example, `mean` and +# `mean.dim` are grouped together by this function. + +NativeGroupT = TypeVar( + "NativeGroupT", + bound=Union[NativeFunctionsGroup, NativeFunctionsViewGroup], +) + + +def group_functions_by_op_name( + grouped_native_functions: Sequence[NativeGroupT], +) -> Sequence[Sequence[NativeGroupT]]: + if not grouped_native_functions: + return [] + groups = [] + + def is_supported(g: NativeFunctionsGroup | NativeFunctionsViewGroup) -> bool: + with native_function_manager(g): + return generator.is_supported(g) + + eligible_ops = (g for g in grouped_native_functions if is_supported(g)) + groups = [ + list(group) + for k, group in ( + itertools.groupby( + eligible_ops, + key=config.func_name_base_str, + ) + ) + ] + + return groups + + +def clang_format(cpp_file_path: str) -> None: + import subprocess + + subprocess.check_call(["clang-format", "-i", cpp_file_path]) + + +def write_cpp(cpp_ops: Sequence[str], file_path: str) -> None: + code = "\n".join(cpp_ops) + generated = f"""// @lint-ignore-every CLANGTIDY HOWTOEVEN +// AUTO-GENERATED FROM: torchgen/static_runtime/gen_static_runtime_ops.py +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch {{ +namespace jit {{ + +{code} + +}} // namespace jit +}} // namespace torch +""" + with open(file_path, "w") as f: + f.write(generated) + clang_format(file_path) + + +def write_test_cpp(cpp_ops: Sequence[str], file_path: str) -> None: + code = "\n".join(cpp_ops) + generated = f"""// @lint-ignore-every CLANGTIDY HOWTOEVEN +// AUTO-GENERATED FROM: torchgen/static_runtime/gen_static_runtime_ops.py +#include +#include +#include + +#include "test_utils.h" + +using namespace caffe2; +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; +using c10::IValue; + +{code} + +""" + with open(file_path, "w") as f: + f.write(generated) + clang_format(file_path) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate ATen source files") + parser.add_argument( + "-s", + "--source-path", + help="path to source directory for ATen", + default="caffe2/aten/src/ATen", + ) + parser.add_argument( + "-p", + "--generated-ops-cpp-path", + help="path to directory to generate op 
dispatcher .cpp file", + default="caffe2/torch/csrc/jit/runtime/static/generated_ops.cpp", + ) + parser.add_argument( + "-t", + "--generated-ops-test-cpp-path", + help="path to directory to generate op dispatcher .cpp file", + default="caffe2/benchmarks/static_runtime/test_generated_ops.cc", + ) + options = parser.parse_args() + native_yaml_path = os.path.join(options.source_path, "native/native_functions.yaml") + tags_yaml_path = os.path.join(options.source_path, "native/tags.yaml") + parsed_yaml = gen.parse_native_yaml(native_yaml_path, tags_yaml_path) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + + op_generator = generator.GenOpDispatcher() + test_case_generator = generator.GenOpTestCase() + + native_functions_groups = [ + g + for g in gen.get_grouped_native_functions(native_functions) + if isinstance(g, NativeFunctionsGroup) + ] + + supported_functions_groups = group_functions_by_op_name(native_functions_groups) + + out_variant_op_result = [ + op_generator.out_variant(groups, backend_indices[DispatchKey.CPU]) + for groups in supported_functions_groups + ] + out_variant_test_result = [ + test_case_generator.out_variant(groups) for groups in supported_functions_groups + ] + + native_functions_view_groups = [ + g + for g in gen.get_grouped_by_view_native_functions(native_functions) + if isinstance(g, NativeFunctionsViewGroup) + ] + + supported_functions_view_groups = group_functions_by_op_name( + native_functions_view_groups + ) + + view_op_result = [ + op_generator.view(groups, backend_indices[DispatchKey.CPU]) + for groups in supported_functions_view_groups + ] + view_test_result = [ + test_case_generator.view(groups) for groups in supported_functions_view_groups + ] + + op_result = out_variant_op_result + ["\n\n"] + view_op_result + test_result = out_variant_test_result + ["\n\n"] + view_test_result + + write_cpp(op_result, options.generated_ops_cpp_path) + write_test_cpp(test_result, options.generated_ops_test_cpp_path) + + print( + "\ntotal grouped native ops: %d" + % len(gen.get_grouped_native_functions(native_functions)) + ) + + print("grouped native ops with out variant: %d" % len(native_functions_groups)) + supported_functions_num = sum(len(groups) for groups in supported_functions_groups) + print("generated functions groups with out variant: %d" % supported_functions_num) + + print("\nview grouped native ops: %d" % len(native_functions_view_groups)) + supported_view_functions_num = sum( + len(groups) for groups in supported_functions_view_groups + ) + print("generated functions view groups: %d" % supported_view_functions_num) + + print( + "\noverall generated : %d" + % (supported_functions_num + supported_view_functions_num) + ) + + +if __name__ == "__main__": + set_simple_logging(escape_newlines=False) + main() diff --git a/torchgen/static_runtime/generator.py b/torchgen/static_runtime/generator.py new file mode 100644 index 00000000000..7bbb7f64d86 --- /dev/null +++ b/torchgen/static_runtime/generator.py @@ -0,0 +1,809 @@ +from __future__ import annotations + +import json +import logging +import math +from typing import Sequence + +import torchgen.api.cpp as cpp +from torchgen.context import native_function_manager +from torchgen.model import ( + Argument, + BackendIndex, + BaseTy, + BaseType, + FunctionSchema, + NativeFunctionsGroup, + NativeFunctionsViewGroup, + OptionalType, + SelfArgument, + TensorOptionsArguments, + Type, +) +from torchgen.static_runtime import config + + +logger: logging.Logger = 
logging.getLogger() + + +def has_alias( + arguments: Sequence[Argument | SelfArgument | TensorOptionsArguments], +) -> bool: + for arg in arguments: + annotation = getattr(arg, "annotation", None) + if not annotation: + continue + alias_set = getattr(annotation, "alias_set", ()) + if alias_set: + return True + return False + + +BLOCKED_OPS = frozenset( + ( + # non cpu ops + "sparse_sampled_addmm", + "hspmm", + "linalg_svdvals", + # sparse ops + "sspaddmm", + "coalesce", + "_indices", + "indices", + "_values", + "values", + "crow_indices", + "col_indices", + # deprecated ops + "floor_divide", + "ger", + # buggy ops + "conj_physical", # P495807361 + "binary_cross_entropy", # P496394764 + "arccosh", + # uncommon ops + "cholesky", + "lu_solve", + "linalg_cholesky", + "linalg_householder_product", + "linalg_ldl_solve", + "_compute_linear_combination", + # training related ops + "_make_dual", + # cannot call directly + "_fw_primal", + # no documentation + "_index_reduce", + # TODO: these ones got added recently and need manual inspection + "_new_zeros_with_same_feature_meta", + "_conj_physical", + "binary_cross_entropy_with_logits", + "bincount", + "conv_tbc", + "copy", + "_copy_from", + "_copy_from_and_resize", + "count_nonzero", + "cudnn_affine_grid_generator", + "cudnn_affine_grid_generator_backward", + "cudnn_grid_sampler", + "diag_embed", + "embedding", + "embedding_dense_backward", + "_embedding_bag_dense_backward", + "_embedding_bag_per_sample_weights_backward", + "grid_sampler_2d", + "_grid_sampler_2d_cpu_fallback", + "grid_sampler_3d", + "isnan", + "mkldnn_linear", + "median", + "nanmedian", + "_sparse_sparse_matmul", + "batch_norm_backward_elemt", + "_euclidean_dist", + "pixel_shuffle", + "pixel_unshuffle", + "channel_shuffle", + "_reshape_nested_backward", + "relu", + "prelu", + "celu", + "slice_scatter", + "select_scatter", + "diagonal_scatter", + "sum", + "_mkldnn_transpose", + "_nested_tensor_from_mask", + "_nested_from_padded", + "_nested_tensor_size", + "_nested_from_padded_and_nested_example", + "_standard_gamma_grad", + "_dirichlet_grad", + "native_norm", + "_sparse_softmax", + "_sparse_softmax_backward_data", + "_sparse_log_softmax", + "_sparse_log_softmax_backward_data", + "zero", + "_sparse_addmm", + "sparse_mask", + "_sparse_mask_projection", + "_to_dense", + "_coalesce", + "_coalesced", + "copy_sparse_to_sparse", + "to_sparse", + "to_sparse_csr", + "to_sparse_csc", + "to_mkldnn", + "quantize_per_tensor_dynamic", + "quantize_per_channel", + "q_per_channel_scales", + "q_per_channel_zero_points", + "int_repr", + "_make_per_channel_quantized_tensor", + "set", + "lift", + "lift_fresh", + "lift_fresh_copy", + "masked_scatter", + "_masked_softmax", + "_masked_softmax_backward", + "put", + "index_reduce", + "trace", + "_cholesky_solve_helper", + "dist", + "max", + "_torch_cuda_cu_linker_symbol_op", + "glu_jvp", + "glu_backward_jvp", + "hardswish_backward", + "rrelu_with_noise_backward", + "mkldnn_adaptive_avg_pool2d_backward", + "_adaptive_avg_pool2d_backward", + "_adaptive_avg_pool3d_backward", + "isinf", + "linalg_lu_solve", + "linalg_vecdot", + "linalg_matrix_exp", + "linalg_eigvalsh", + "_test_warn_in_autograd", + "_test_autograd_multiple_dispatch_view", + "_test_autograd_multiple_dispatch_view_copy", + "_segment_reduce", + "_segment_reduce_backward", + "_fw_primal_copy", + "_make_dual_copy", + "view_as_real_copy", + "view_as_complex_copy", + "_conj_copy", + "_neg_view_copy", + "diagonal_copy", + "detach_copy", + "squeeze_copy", + "t_copy", + "unsqueeze_copy", + 
"_indices_copy", + "_values_copy", + "indices_copy", + "values_copy", + "crow_indices_copy", + "col_indices_copy", + "ccol_indices", + "ccol_indices_copy", + "row_indices", + "row_indices_copy", + "unfold_copy", + "alias_copy", + "_triton_multi_head_attention", + "special_airy_ai", + "special_bessel_j0", + "special_bessel_j1", + "special_bessel_y0", + "special_bessel_y1", + "special_chebyshev_polynomial_t", + "special_chebyshev_polynomial_u", + "special_chebyshev_polynomial_v", + "special_chebyshev_polynomial_w", + "special_hermite_polynomial_h", + "special_hermite_polynomial_he", + "special_laguerre_polynomial_l", + "special_legendre_polynomial_p", + "special_modified_bessel_i0", + "special_modified_bessel_i1", + "special_modified_bessel_k0", + "special_modified_bessel_k1", + "special_scaled_modified_bessel_k0", + "special_scaled_modified_bessel_k1", + "special_shifted_chebyshev_polynomial_t", + "special_shifted_chebyshev_polynomial_u", + "special_shifted_chebyshev_polynomial_v", + "special_shifted_chebyshev_polynomial_w", + "special_spherical_bessel_j0", + "_foobar", + "_nested_tensor_strides", + "_nested_tensor_storage_offsets", + "_nested_get_values", # no CPU backend + "_nested_get_values_copy", # no CPU backend + "_nested_view_from_jagged", # testing needs to be patched + "_nested_view_from_jagged_copy", # testing needs to be patched + "_nested_view_from_buffer", # testing needs to be patched + "_nested_view_from_buffer_copy", # testing needs to be patched + "_int_mm", # testing needs to be patched + "_to_sparse_csc", # testing needs to be patched + "_to_sparse_csr", # testing needs to be patched + "segment_reduce", # testing needs to be patched + ) +) + + +def is_supported(g: NativeFunctionsGroup | NativeFunctionsViewGroup) -> bool: + base_op_name = "" + func = None + if isinstance(g, NativeFunctionsViewGroup): + base_op_name = g.view.root_name + func = g.view.func + else: + base_op_name = g.out.func.name.name.base + func = g.out.func + if config.is_hand_written(g): + logger.info("HAND WRITTEN: %s", base_op_name) + return False + if base_op_name in BLOCKED_OPS: + logger.info("BLOCKED: %s", base_op_name) + return False + for arg in func.schema_order_arguments(): + maybe_method = ivalue_type_conversion_method(arg.type) + if not maybe_method: + # Type converting is unsupported yet. + logger.info("NOT SUPPORTED TYPE CONVERTING: %s", func) + return False + + if isinstance(g, NativeFunctionsViewGroup): + # TODO: stop doing type tests by converting to C++ and then testing + # the string, just test the dang thing directly + if "at::Tensor" != cpp.returns_type(func.returns, symint=False).cpp_type(): + # Returns a non-Tensor value. + logger.info("NON-TENSOR RET TYPE: %s", str(func)) + return False + return True + + # For out variant ops, we need to check the arguments of its functional func. + for arg in g.functional.func.schema_order_arguments(): + maybe_method = ivalue_type_conversion_method(arg.type) + if not maybe_method: + # Type converting is unsupported yet. + logger.info("NOT SUPPORTED TYPE CONVERTING: %s", g.functional.func) + return False + + if not g.structured: + # In case of unstructured op, we check if it has out variant implementation. + # The out variant implementation satisfies the minimum requirement that it has the output tensor as the last + # parameter. + if ( + not hasattr(g, "out") + or not str(func).endswith("Tensor(a!) 
out) -> Tensor(a!)") + or not str(func.name).endswith(".out") + ): + return False + # TODO: stop type testing by converting to C++ + if "at::Tensor &" != cpp.returns_type(func.returns, symint=False).cpp_type(): + logger.info("NON_TENSOR RET TYPE: %s", func) + return False + if has_alias(func.arguments.non_out): + # This op may create an alias of inputs. + logger.info("INPUTS ALIAS: %s", base_op_name) + return False + return True + + +def ivalue_type_conversion_method( + arg_type: BaseType | OptionalType | Type, +) -> tuple[bool, str] | None: + """ + Return the method call expression of `c10::ivalue' to convert its contained value to + the expected value of `arg_type` type. For example, for `arg_type` == BaseTy.Tensor, + this function returns ".toTensor()", so that it can be appended to the ivalue's + variable name to get the value of the expected type. + """ + type_conversion_methods = { + BaseTy.Tensor: ((True, "toTensor()"), (False, "toOptional()")), + BaseTy.int: ((False, "toInt()"), (False, "toOptional()")), + BaseTy.bool: ((False, "toBool()"), (False, "toOptional()")), + BaseTy.Scalar: ((False, "toScalar()"), (False, "toOptional()")), + BaseTy.ScalarType: ( + (False, "toScalarType()"), + (False, "toOptional()"), + ), + BaseTy.str: ( + (False, "toStringView()"), + (False, "toOptional()"), + ), + } + + base_ty_object = None + if isinstance(arg_type, BaseType): + base_ty_object = arg_type.name + elif isinstance(arg_type, OptionalType): + if not isinstance(arg_type.elem, BaseType): + # ListType is currently unsupported. + return None + base_ty_object = arg_type.elem.name + else: + return None + + if base_ty_object not in type_conversion_methods: + return None + methods = type_conversion_methods[base_ty_object] + if isinstance(arg_type, BaseType): + return methods[0] + return methods[1] + + +should_use_int_tensor_ops_ = frozenset( + ( + "bitwise_not", + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "bitwise_left_shift", + "bitwise_right_shift", + "gcd", + "lcm", + "scatter", + "gather", + "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo", + ) +) +should_use_complex_tensor_ops_ = frozenset(("view_as_real", "imag", "_conj")) + + +def should_use_int_tensor(op_name: str) -> bool: + return op_name in should_use_int_tensor_ops_ + + +def should_use_complex_tensor(op_name: str) -> bool: + return op_name in should_use_complex_tensor_ops_ + + +test_tensor_dim_ops_1_ = frozenset( + ( + "addmv", + "index_add", + "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo", + "nll_loss_backward", + "dot", + "vdot", + "outer", + "ger", + ) +) +test_tensor_dim_ops_2_ = frozenset( + ("addmm", "mm", "nuclear_norm", "diag", "_addmm_activation", "matrix_H", "t") +) + + +def test_tensor_dim(op_name: str) -> int: + if op_name in test_tensor_dim_ops_1_: + return 1 + if op_name in test_tensor_dim_ops_2_: + return 2 + return 3 + + +test_tensor_shapes_string = '{"view_as_complex": "{2, 2}"}' +test_tensor_shape_json: dict[str, str] = json.loads(test_tensor_shapes_string) + + +def test_tensor_shape(op_name: str) -> str: + if op_name in test_tensor_shape_json: + return test_tensor_shape_json[op_name] + else: + return "" + + +def test_value_expression( + arg_type: BaseType | OptionalType | Type, index: int, op_name: str +) -> str: + tensor_size_ex = test_tensor_shape(op_name) + if tensor_size_ex == "": + num_tensors = 16 if index == 0 else 64 + num_dim = test_tensor_dim(op_name) + size_per_dim = math.ceil(num_tensors / float(num_dim)) + size_per_dim += size_per_dim % 2 + 
tensor_size_ex = "{{{}}}".format(",".join([f"{size_per_dim}"] * num_dim)) + if should_use_int_tensor(op_name): + tensor_expression = f"at::randint(1, 100, {tensor_size_ex}, at::kInt)" + elif should_use_complex_tensor(op_name): + tensor_expression = f"at::randn({tensor_size_ex}, at::kComplexFloat)" + else: + tensor_expression = f"at::rand({tensor_size_ex})" + + value_expressions = { + BaseTy.Tensor: tensor_expression, + BaseTy.int: "1", + BaseTy.bool: "false", + BaseTy.Scalar: "2", + BaseTy.ScalarType: "at::ScalarType::Float", + BaseTy.str: '"floor"', + } + + base_ty_object = None + if isinstance(arg_type, BaseType): + base_ty_object = arg_type.name + else: + assert isinstance(arg_type, OptionalType) and isinstance( + arg_type.elem, BaseType + ) + base_ty_object = arg_type.elem.name + assert base_ty_object in value_expressions, "not expected type" + value_expression = value_expressions[base_ty_object] + return value_expression + + +def generate_test_value_definitions(schema: FunctionSchema, index: int) -> str: + assert not schema.is_out_fn() + schema_name = schema.name.name.base + arg_map = {} + for arg in schema.schema_order_arguments(): + test_value_exp = test_value_expression(arg.type, index, schema_name) + arg_map[arg.name] = test_value_exp + config.override_test_values(arg_map, schema_name, index) + arg_populations = [] + for arg_name, arg_value in arg_map.items(): + arg_populations.append(f"auto {arg_name}{index} = {arg_value}") + return ";\n ".join(arg_populations) + ";" + + +def generate_test_value_names(schema: FunctionSchema, index: int) -> str: + assert not schema.is_out_fn() + return ",".join(f"{arg.name}{index}" for arg in schema.schema_order_arguments()) + + +generate_test_ir_arguments_base_ty_to_type_str_ = { + BaseTy.Tensor: "Tensor", + BaseTy.int: "int", + BaseTy.float: "float", + BaseTy.str: "str", + BaseTy.Scalar: "int", + BaseTy.ScalarType: "int", + BaseTy.bool: "bool", +} + + +def generate_test_ir_arguments( + schema: FunctionSchema, +) -> list[tuple[str, str | None]]: + def ir_argument(arg: Argument) -> tuple[str, str | None]: + t = arg.type + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + assert isinstance(t, BaseType) + type_str = None + if t.name in generate_test_ir_arguments_base_ty_to_type_str_: + type_str = generate_test_ir_arguments_base_ty_to_type_str_[t.name] + if type_str and add_optional: + type_str = f"{type_str}?" 
+ return ("%" + arg.name, type_str) + + return [ir_argument(arg) for arg in schema.schema_order_arguments()] + + +def generate_arg_extraction(schema: FunctionSchema) -> str: + arg_populations = [] + for i, arg in enumerate(schema.schema_order_arguments()): + maybe_method = ivalue_type_conversion_method(arg.type) + assert maybe_method + is_reference, type_conversion_method = maybe_method + reference = "&" if is_reference else "" + arg_populations.append( + f"const auto{reference} {arg.name} = p_node->Input({i}).{type_conversion_method}" + ) + return ";\n ".join(arg_populations) + ";" + + +def get_kernel_name(g: NativeFunctionsGroup, backend_index: BackendIndex) -> str: + kernel = backend_index.get_kernel(g.functional) + if g.structured or kernel is None: + return cpp.name(g.functional.func) + return kernel.kernel + + +def get_out_kernel_name(g: NativeFunctionsGroup, backend_index: BackendIndex) -> str: + kernel = backend_index.get_kernel(g.out) + if g.structured or kernel is None: + return cpp.name(g.out.func) + return kernel.kernel + + +def generate_non_out_variant_call( + g: NativeFunctionsGroup, backend_index: BackendIndex +) -> str: + schema = g.functional.func + assert not schema.is_out_fn() + kernel_name = get_kernel_name(g, backend_index) + arg_names = (arg.name for arg in schema.schema_order_arguments()) + namespace_name = "cpu" if g.structured else "native" + return f'at::{namespace_name}::{kernel_name}({",".join(arg_names)})' + + +def generate_call_to_view_ops( + g: NativeFunctionsViewGroup, backend_index: BackendIndex +) -> str: + schema = g.view.func + kernel_name = cpp.name(schema) + kernel = backend_index.get_kernel(g.view) + if kernel: + kernel_name = kernel.kernel + arg_names = (arg.name for arg in schema.schema_order_arguments()) + namespace_name = "native" + return f'at::{namespace_name}::{kernel_name}({",".join(arg_names)})' + + +def generate_out_variant_call( + g: NativeFunctionsGroup, backend_index: BackendIndex +) -> str: + schema = g.out.func + assert schema.is_out_fn() + arg_names = [] + kernel_name = get_out_kernel_name(g, backend_index) + if g.structured: + # structured op starts with the output tensor argument. 
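+        # (e.g. the structured kernel at::cpu::add_out(out, self, other, alpha) takes the output first)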
+ arg_names = [out_arg.name for out_arg in schema.arguments.out] + else: + arg_names = [] + for arg in schema.arguments.non_out: + if isinstance(arg, SelfArgument): + arg_names.append(arg.argument.name) + else: + assert isinstance(arg, Argument) + arg_names.append(arg.name) + if not g.structured: + assert len(schema.arguments.out) == 1 + arg_names.append(schema.arguments.out[0].name) + cpp_arg_names = ",".join(arg_names) + namespace_name = "cpu" if g.structured else "native" + return f"at::{namespace_name}::{kernel_name}({cpp_arg_names})" + + +no_memory_resize_ops = frozenset( + ( + "isin.Scalar_Tensor", + "index_add", + "dot", + "vdot", + "nuclear_norm", + "histc", + "l1_loss", + "multi_margin_loss", + "multilabel_margin_loss", + "nll_loss", + "nll_loss2d", + "prod", + ) +) + + +def should_check_resize(schema: FunctionSchema) -> bool: + schema_str = str(schema) + type_variant_op_name = schema_str[: schema_str.find("(")] + return type_variant_op_name not in no_memory_resize_ops + + +def op_name_from_group(g: NativeFunctionsGroup) -> str: + return g.functional.func.name.name.base + + +class GenOpDispatcher: + def out_variant( + self, groups: Sequence[NativeFunctionsGroup], backend_index: BackendIndex + ) -> str: + if not groups: + return "" + generated_type_variants = [] + for g in groups: + with native_function_manager(g): + assert is_supported(g) + assert isinstance(g, NativeFunctionsGroup) + generated_type_variant = self.out_variant_op_generator(g, backend_index) + generated_type_variants.append(generated_type_variant) + op_name = op_name_from_group(groups[0]) + body = "\n".join(generated_type_variants) + generated = f""" +REGISTER_OPERATOR_FUNCTOR( + aten::{op_name}, + aten_{op_name}, + [](Node* n) -> SROperator {{ + {body} + LogAndDumpSchema(n); + return nullptr; + }}); +""" + return generated + + def view( + self, groups: Sequence[NativeFunctionsViewGroup], backend_index: BackendIndex + ) -> str: + if not groups: + return "" + generated_type_variants = [] + for g in groups: + with native_function_manager(g): + assert is_supported(g) + assert isinstance(g, NativeFunctionsViewGroup) + generated_type_variant = self.view_op_generator(g, backend_index) + generated_type_variants.append(generated_type_variant) + op_name = config.func_name_base_str(groups[0]) + body = "\n".join(generated_type_variants) + generated = f""" +REGISTER_NATIVE_OPERATOR_FUNCTOR( + aten::{op_name}, + aten_{op_name}, + [](Node* n) -> SROperator {{ + {body} + LogAndDumpSchema(n); + return nullptr; + }}); +""" + return generated + + def out_variant_op_generator( + self, g: NativeFunctionsGroup, backend_index: BackendIndex + ) -> str: + functional = g.functional + schema = str(functional.func) + populated_argument = generate_arg_extraction(g.functional.func) + functional_variant_call = generate_non_out_variant_call(g, backend_index) + assert len(g.out.func.arguments.out) == 1 + out_variable_name = str(g.out.func.arguments.out[0].name) + out_variant_call = generate_out_variant_call(g, backend_index) + generated = f""" + if (n->matches(torch::schema("aten::{schema}"))) {{ + return [](ProcessedNode* p_node) {{ + {populated_argument} + if (p_node->Output(0).isNone()) {{ + p_node->Output(0) = {functional_variant_call}; + return; + }} + auto& {out_variable_name} = p_node->Output(0).toTensor(); + fastResizeToZero({out_variable_name}); + {out_variant_call}; + }}; + }}""" + return generated + + def view_op_generator( + self, g: NativeFunctionsViewGroup, backend_index: BackendIndex + ) -> str: + schema = str(g.view.func) + 
populated_argument = generate_arg_extraction(g.view.func)
+        functional_variant_call = generate_call_to_view_ops(g, backend_index)
+        generated = f"""
+    if (n->matches(torch::schema("aten::{schema}"))) {{
+      return [](ProcessedNode* p_node) {{
+        {populated_argument}
+        p_node->Output(0) = {functional_variant_call};
+      }};
+    }}"""
+        return generated
+
+
+class GenOpTestCase:
+    def out_variant(self, groups: Sequence[NativeFunctionsGroup]) -> str:
+        if not groups:
+            return ""
+        generated_type_variants = []
+        for g in groups:
+            with native_function_manager(g):
+                assert is_supported(g)
+                assert isinstance(g, NativeFunctionsGroup)
+                generated_type_variant = self.out_variant_op_test_case_generator(g)
+                generated_type_variants.append(generated_type_variant)
+        return "\n".join(generated_type_variants)
+
+    def view(self, groups: Sequence[NativeFunctionsViewGroup]) -> str:
+        if not groups:
+            return ""
+        generated_type_variants = []
+        for g in groups:
+            with native_function_manager(g):
+                assert is_supported(g)
+                assert isinstance(g, NativeFunctionsViewGroup)
+                generated_type_variant = self.view_op_test_case_generator(g)
+                generated_type_variants.append(generated_type_variant)
+        return "\n".join(generated_type_variants)
+
+    def out_variant_op_test_case_generator(self, g: NativeFunctionsGroup) -> str:
+        schema = g.functional.func
+        schema_str = str(schema)
+        assert schema_str.find("(") > 0
+        type_variant_op_name = schema_str[: schema_str.find("(")].replace(".", "_")
+        op_name = op_name_from_group(g)
+        assert type_variant_op_name.startswith(op_name)
+
+        arg_types = generate_test_ir_arguments(schema)
+        arg_declarations = ", ".join(
+            (
+                arg_name if arg_type is None else f"{arg_name}: {arg_type}"
+                for arg_name, arg_type in arg_types
+            )
+        )
+        arg_names = ", ".join((arg_name for arg_name, _ in arg_types))
+        assert (
+            len(schema.returns) == 1
+            and isinstance(schema.returns[0].type, BaseType)
+            and schema.returns[0].type.name is BaseTy.Tensor
+        )
+        test_value_definitions = generate_test_value_definitions(schema, 0)
+        test_value_names = generate_test_value_names(schema, 0)
+        test_value_definitions2 = generate_test_value_definitions(schema, 1)
+        test_value_names2 = generate_test_value_names(schema, 1)
+        check_resize = "true" if should_check_resize(schema) else "false"
+        generated = f"""
+TEST(StaticRuntime, autogen_{type_variant_op_name}) {{
+  const std::string script = R"IR(
+    graph({arg_declarations}):
+        %bias: None = prim::Constant()
+        %ret = aten::{op_name}({arg_names})
+        %cloned = aten::clone(%ret, %bias)
+        return (%cloned)
+  )IR";
+
+  {test_value_definitions}
+  std::vector<IValue> args{{{test_value_names}}};
+  testStaticRuntime(script, args, {{}}, /*use_allclose=*/false, /*use_equalnan=*/false, /*check_resize=*/{check_resize});
+
+  {test_value_definitions2}
+  std::vector<IValue> args2{{{test_value_names2}}};
+  testStaticRuntime(script, args, args2, /*use_allclose=*/false, /*use_equalnan=*/false, /*check_resize=*/{check_resize});
+
+}}
+"""
+        return generated
+
+    def view_op_test_case_generator(self, g: NativeFunctionsViewGroup) -> str:
+        schema = g.view.func
+        schema_str = str(schema)
+        assert schema_str.find("(") > 0
+        type_variant_op_name = schema_str[: schema_str.find("(")].replace(".", "_")
+        op_name = g.view.root_name
+        assert type_variant_op_name.startswith(op_name)
+
+        arg_types = generate_test_ir_arguments(schema)
+        arg_declarations = ", ".join(
+            (
+                arg_name if arg_type is None else f"{arg_name}: {arg_type}"
+                for arg_name, arg_type in arg_types
+            )
+        )
+        arg_names = ", ".join((arg_name for arg_name, _ in arg_types))
+        assert (
+            len(schema.returns) == 1
+            and isinstance(schema.returns[0].type, BaseType)
+            and schema.returns[0].type.name is BaseTy.Tensor
+        )
+        test_value_definitions = generate_test_value_definitions(schema, 0)
+        test_value_names = generate_test_value_names(schema, 0)
+        generated = f"""
+TEST(StaticRuntime, autogen_{type_variant_op_name}) {{
+  const std::string script = R"IR(
+    graph({arg_declarations}):
+        %bias: None = prim::Constant()
+        %ret = aten::{op_name}({arg_names})
+        %cloned = aten::clone(%ret, %bias)
+        return (%cloned)
+  )IR";
+
+  {test_value_definitions}
+  std::vector<IValue> args{{{test_value_names}}};
+  testStaticRuntime(script, args);
+}}
+"""
+
+        return generated
diff --git a/torchgen/tags.yaml b/torchgen/tags.yaml
new file mode 100644
index 00000000000..c3172172903
--- /dev/null
+++ b/torchgen/tags.yaml
@@ -0,0 +1,65 @@
+# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml`
+
+- tag: inplace_view
+  desc: |
+    This tag indicates if an operator *only* modifies the tensor metadata
+- tag: pt2_compliant_tag
+  desc: |
+    This tag indicates if the operator is guaranteed to
+    work with the PT2 compilation APIs (torch.compile,
+    torch.export, etc). If you add this tag to an
+    operator, please use
+    `torch.testing._internal.optest.opcheck` to test that
+    the operator has been registered correctly and
+    works with torch.compile
+- tag: view_copy
+  desc: |
+    This tag indicates operators that are *_copy* variants
+    of view/aliasing operators. If an operator has a view_copy tag,
+    then it should have the name {op}_copy, where {op} is a view operator.
+- tag: dynamic_output_shape
+  desc: |
+    This tag indicates if an operator's output's shape depends on input Tensor
+    data.
+- tag: data_dependent_output
+  desc: |
+    Operator has a non-Tensor output whose value is dependent on the data
+    of Tensor inputs. Among other things, this implies that this operator
+    cannot be run with meta tensor (since data is not available), nor
+    can it be symbolically traced.
+- tag: generated
+  desc: |
+    This tag indicates that the operator doesn't have an explicit entry in
+    native_functions.yaml, and instead was generated automatically by the codegen.
+- tag: nondeterministic_seeded
+  desc: |
+    This tag indicates if an operator is nondeterministically seeded
+    (i.e., is random) such that the operator intentionally produces
+    different results when run twice on the same inputs, but this randomness
+    is controlled by a Generator which, if reseeded would give you the
+    same result.
+- tag: nondeterministic_bitwise
+  desc: |
+    This tag indicates if an operator doesn't guarantee bitwise equivalence
+    across different runs of an operator with identical inputs.
+- tag: needs_fixed_stride_order
+  desc: |
+    This tag indicates that the operator should be passed Tensors following
+    the same stride permutation as observed in eager when compiled in inductor.
+
+# NOTE [Core ATen Ops]
+- tag: core
+  desc: |
+    Core aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
+    functionalization pass. Core aten ops are fully functional and adhere to single static
+    assignment (SSA): this implies there will be no `inplace` or `_out` variants in this opset.
+    This opset is designed to serve as the functional IR to interface with compiler backends.
+    In contrast to primTorch, core aten opset doesn't decompose ops into explicit
+    type promotion and broadcasting ops.
+ Core aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True), + and thus can be used as an opset for export purpose. +- tag: pointwise + desc: | + Pointwise operators are operators where each element of the output is computed only by accessing + the corresponding element of all the broadcasted inputs. The output shape will be the broadcasted + shape of the inputs. diff --git a/torchgen/utils.py b/torchgen/utils.py new file mode 100644 index 00000000000..6d83a27dc9e --- /dev/null +++ b/torchgen/utils.py @@ -0,0 +1,519 @@ +from __future__ import annotations + +import contextlib +import functools +import hashlib +import os +import re +import sys +import textwrap +from dataclasses import fields, is_dataclass +from enum import auto, Enum +from pathlib import Path +from typing import ( + Any, + Callable, + Generic, + Iterable, + Iterator, + Literal, + NoReturn, + Sequence, + TYPE_CHECKING, + TypeVar, +) +from typing_extensions import Self + +from torchgen.code_template import CodeTemplate + + +if TYPE_CHECKING: + from argparse import Namespace + + +REPO_ROOT = Path(__file__).absolute().parent.parent + + +# Many of these functions share logic for defining both the definition +# and declaration (for example, the function signature is the same), so +# we organize them into one function that takes a Target to say which +# code we want. +# +# This is an OPEN enum (we may add more cases to it in the future), so be sure +# to explicitly specify with Literal[Target.XXX] or Literal[Target.XXX, Target.YYY] +# what targets are valid for your use. +class Target(Enum): + # top level namespace (not including at) + DEFINITION = auto() + DECLARATION = auto() + # TORCH_LIBRARY(...) { ... } + REGISTRATION = auto() + # namespace { ... } + ANONYMOUS_DEFINITION = auto() + # namespace cpu { ... } + NAMESPACED_DEFINITION = auto() + NAMESPACED_DECLARATION = auto() + + +# Matches "foo" in "foo, bar" but not "foobar". Used to search for the +# occurrence of a parameter in the derivative formula +IDENT_REGEX = r"(^|\W){}($|\W)" + + +# TODO: Use a real parser here; this will get bamboozled +def split_name_params(schema: str) -> tuple[str, list[str]]: + m = re.match(r"(\w+)(\.\w+)?\((.*)\)", schema) + if m is None: + raise RuntimeError(f"Unsupported function schema: {schema}") + name, _, params = m.groups() + return name, params.split(", ") + + +T = TypeVar("T") +S = TypeVar("S") + +# These two functions purposely return generators in analogy to map() +# so that you don't mix up when you need to list() them + + +# Map over function that may return None; omit Nones from output sequence +def mapMaybe(func: Callable[[T], S | None], xs: Iterable[T]) -> Iterator[S]: + for x in xs: + r = func(x) + if r is not None: + yield r + + +# Map over function that returns sequences and cat them all together +def concatMap(func: Callable[[T], Sequence[S]], xs: Iterable[T]) -> Iterator[S]: + for x in xs: + yield from func(x) + + +# Conveniently add error context to exceptions raised. Lets us +# easily say that an error occurred while processing a specific +# context. 
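+# Illustrative use (the names yaml_path and parse_yaml are hypothetical):
+#
+#     with context(lambda: f"while parsing {yaml_path}"):
+#         parse_yaml(yaml_path)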
+@contextlib.contextmanager +def context(msg_fn: Callable[[], str]) -> Iterator[None]: + try: + yield + except Exception as e: + # TODO: this does the wrong thing with KeyError + msg = msg_fn() + msg = textwrap.indent(msg, " ") + msg = f"{e.args[0]}\n{msg}" if e.args else msg + e.args = (msg,) + e.args[1:] + raise + + +# A little trick from https://github.com/python/mypy/issues/6366 +# for getting mypy to do exhaustiveness checking +# TODO: put this somewhere else, maybe +def assert_never(x: NoReturn) -> NoReturn: + raise AssertionError(f"Unhandled type: {type(x).__name__}") + + +@functools.lru_cache(maxsize=None) +def _read_template(template_fn: str) -> CodeTemplate: + return CodeTemplate.from_file(template_fn) + + +# String hash that's stable across different executions, unlike builtin hash +def string_stable_hash(s: str) -> int: + sha1 = hashlib.sha1(s.encode("latin1")).digest() + return int.from_bytes(sha1, byteorder="little") + + +# A small abstraction for writing out generated files and keeping track +# of what files have been written (so you can write out a list of output +# files) +class FileManager: + install_dir: str + template_dir: str + dry_run: bool + filenames: set[str] + + def __init__(self, install_dir: str, template_dir: str, dry_run: bool) -> None: + self.install_dir = install_dir + self.template_dir = template_dir + self.filenames = set() + self.dry_run = dry_run + + def _write_if_changed(self, filename: str, contents: str) -> None: + old_contents: str | None + try: + with open(filename) as f: + old_contents = f.read() + except OSError: + old_contents = None + if contents != old_contents: + # Create output directory if it doesn't exist + os.makedirs(os.path.dirname(filename), exist_ok=True) + with open(filename, "w") as f: + f.write(contents) + + # Read from template file and replace pattern with callable (type could be dict or str). 
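+    # If the callable returns a dict, it is substituted into the CodeTemplate;
+    # if it returns a str, that string is emitted verbatim.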
+ def substitute_with_template( + self, template_fn: str, env_callable: Callable[[], str | dict[str, Any]] + ) -> str: + template_path = os.path.join(self.template_dir, template_fn) + env = env_callable() + if isinstance(env, dict): + if "generated_comment" not in env: + generator_default = REPO_ROOT / "torchgen" / "gen.py" + try: + generator = Path( + sys.modules["__main__"].__file__ or generator_default + ).absolute() + except (KeyError, AttributeError): + generator = generator_default.absolute() + + try: + generator_path = generator.relative_to(REPO_ROOT).as_posix() + except ValueError: + generator_path = generator.name + + env = { + **env, # copy the original dict instead of mutating it + "generated_comment": ( + "@" + f"generated by {generator_path} from {template_fn}" + ), + } + template = _read_template(template_path) + return template.substitute(env) + elif isinstance(env, str): + return env + else: + assert_never(env) + + def write_with_template( + self, + filename: str, + template_fn: str, + env_callable: Callable[[], str | dict[str, Any]], + ) -> None: + filename = f"{self.install_dir}/{filename}" + assert filename not in self.filenames, "duplicate file write {filename}" + self.filenames.add(filename) + if not self.dry_run: + substitute_out = self.substitute_with_template( + template_fn=template_fn, + env_callable=env_callable, + ) + self._write_if_changed(filename=filename, contents=substitute_out) + + def write( + self, + filename: str, + env_callable: Callable[[], str | dict[str, Any]], + ) -> None: + self.write_with_template(filename, filename, env_callable) + + def write_sharded( + self, + filename: str, + items: Iterable[T], + *, + key_fn: Callable[[T], str], + env_callable: Callable[[T], dict[str, list[str]]], + num_shards: int, + base_env: dict[str, Any] | None = None, + sharded_keys: set[str], + ) -> None: + everything: dict[str, Any] = {"shard_id": "Everything"} + shards: list[dict[str, Any]] = [ + {"shard_id": f"_{i}"} for i in range(num_shards) + ] + all_shards = [everything] + shards + + if base_env is not None: + for shard in all_shards: + shard.update(base_env) + + for key in sharded_keys: + for shard in all_shards: + if key in shard: + assert isinstance( + shard[key], list + ), "sharded keys in base_env must be a list" + shard[key] = shard[key].copy() + else: + shard[key] = [] + + def merge_env(into: dict[str, list[str]], from_: dict[str, list[str]]) -> None: + for k, v in from_.items(): + assert k in sharded_keys, f"undeclared sharded key {k}" + into[k] += v + + if self.dry_run: + # Dry runs don't write any templates, so incomplete environments are fine + items = () + + for item in items: + key = key_fn(item) + sid = string_stable_hash(key) % num_shards + env = env_callable(item) + + merge_env(shards[sid], env) + merge_env(everything, env) + + dot_pos = filename.rfind(".") + if dot_pos == -1: + dot_pos = len(filename) + base_filename = filename[:dot_pos] + extension = filename[dot_pos:] + + for shard in all_shards: + shard_id = shard["shard_id"] + self.write_with_template( + f"{base_filename}{shard_id}{extension}", filename, lambda: shard + ) + + # filenames is used to track compiled files, but FooEverything.cpp isn't meant to be compiled + self.filenames.discard( + f"{self.install_dir}/{base_filename}Everything{extension}" + ) + + def write_outputs(self, variable_name: str, filename: str) -> None: + """Write a file containing the list of all outputs which are + generated by this script.""" + content = "set({}\n {})".format( + variable_name, + "\n ".join('"' + 
name + '"' for name in sorted(self.filenames)), + ) + self._write_if_changed(filename, content) + + def template_dir_for_comments(self) -> str: + """ + This needs to be deterministic. The template dir is an absolute path + that varies across builds. So, just use the path relative to this file, + which will point to the codegen source but will be stable. + """ + return os.path.relpath(self.template_dir, os.path.dirname(__file__)) + + +# Helper function to generate file manager +def make_file_manager( + options: Namespace, install_dir: str | None = None +) -> FileManager: + template_dir = os.path.join(options.source_path, "templates") + install_dir = install_dir if install_dir else options.install_dir + return FileManager( + install_dir=install_dir, template_dir=template_dir, dry_run=options.dry_run + ) + + +# Helper function to create a pretty representation for dataclasses +def dataclass_repr( + obj: Any, + indent: int = 0, + width: int = 80, +) -> str: + # built-in pprint module support dataclasses from python 3.10 + if sys.version_info >= (3, 10): + from pprint import pformat + + return pformat(obj, indent, width) + + return _pformat(obj, indent=indent, width=width) + + +def _pformat( + obj: Any, + indent: int, + width: int, + curr_indent: int = 0, +) -> str: + assert is_dataclass(obj), f"obj should be a dataclass, received: {type(obj)}" + + class_name = obj.__class__.__name__ + # update current indentation level with class name + curr_indent += len(class_name) + 1 + + fields_list = [(f.name, getattr(obj, f.name)) for f in fields(obj) if f.repr] + + fields_str = [] + for name, attr in fields_list: + # update the current indent level with the field name + # dict, list, set and tuple also add indent as done in pprint + _curr_indent = curr_indent + len(name) + 1 + if is_dataclass(attr): + str_repr = _pformat(attr, indent, width, _curr_indent) + elif isinstance(attr, dict): + str_repr = _format_dict(attr, indent, width, _curr_indent) + elif isinstance(attr, (list, set, tuple)): + str_repr = _format_list(attr, indent, width, _curr_indent) + else: + str_repr = repr(attr) + + fields_str.append(f"{name}={str_repr}") + + indent_str = curr_indent * " " + body = f",\n{indent_str}".join(fields_str) + return f"{class_name}({body})" + + +def _format_dict( + attr: dict[Any, Any], + indent: int, + width: int, + curr_indent: int, +) -> str: + curr_indent += indent + 3 + dict_repr = [] + for k, v in attr.items(): + k_repr = repr(k) + v_str = ( + _pformat(v, indent, width, curr_indent + len(k_repr)) + if is_dataclass(v) + else repr(v) + ) + dict_repr.append(f"{k_repr}: {v_str}") + + return _format(dict_repr, indent, width, curr_indent, "{", "}") + + +def _format_list( + attr: list[Any] | set[Any] | tuple[Any, ...], + indent: int, + width: int, + curr_indent: int, +) -> str: + curr_indent += indent + 1 + list_repr = [ + _pformat(l, indent, width, curr_indent) if is_dataclass(l) else repr(l) + for l in attr + ] + start, end = ("[", "]") if isinstance(attr, list) else ("(", ")") + return _format(list_repr, indent, width, curr_indent, start, end) + + +def _format( + fields_str: list[str], + indent: int, + width: int, + curr_indent: int, + start: str, + end: str, +) -> str: + delimiter, curr_indent_str = "", "" + # if it exceed the max width then we place one element per line + if len(repr(fields_str)) >= width: + delimiter = "\n" + curr_indent_str = " " * curr_indent + + indent_str = " " * indent + body = f", {delimiter}{curr_indent_str}".join(fields_str) + return f"{start}{indent_str}{body}{end}" + + +class 
NamespaceHelper: + """A helper for constructing the namespace open and close strings for a nested set of namespaces. + + e.g. for namespace_str torch::lazy, + + prologue: + namespace torch { + namespace lazy { + + epilogue: + } // namespace lazy + } // namespace torch + """ + + def __init__( + self, namespace_str: str, entity_name: str = "", max_level: int = 2 + ) -> None: + # cpp_namespace can be a colon joined string such as torch::lazy + cpp_namespaces = namespace_str.split("::") + assert ( + len(cpp_namespaces) <= max_level + ), f"Codegen doesn't support more than {max_level} level(s) of custom namespace. Got {namespace_str}." + self.cpp_namespace_ = namespace_str + self.prologue_ = "\n".join([f"namespace {n} {{" for n in cpp_namespaces]) + self.epilogue_ = "\n".join( + [f"}} // namespace {n}" for n in reversed(cpp_namespaces)] + ) + self.namespaces_ = cpp_namespaces + self.entity_name_ = entity_name + + @staticmethod + def from_namespaced_entity( + namespaced_entity: str, max_level: int = 2 + ) -> NamespaceHelper: + """ + Generate helper from nested namespaces as long as class/function name. E.g.: "torch::lazy::add" + """ + names = namespaced_entity.split("::") + entity_name = names[-1] + namespace_str = "::".join(names[:-1]) + return NamespaceHelper( + namespace_str=namespace_str, entity_name=entity_name, max_level=max_level + ) + + @property + def prologue(self) -> str: + return self.prologue_ + + @property + def epilogue(self) -> str: + return self.epilogue_ + + @property + def entity_name(self) -> str: + return self.entity_name_ + + # Only allow certain level of namespaces + def get_cpp_namespace(self, default: str = "") -> str: + """ + Return the namespace string from joining all the namespaces by "::" (hence no leading "::"). + Return default if namespace string is empty. 
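+        e.g. NamespaceHelper("torch::lazy").get_cpp_namespace() returns "torch::lazy",
+        while NamespaceHelper("").get_cpp_namespace("at") falls back to the default "at".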
+ """ + return self.cpp_namespace_ if self.cpp_namespace_ else default + + +class OrderedSet(Generic[T]): + storage: dict[T, Literal[None]] + + def __init__(self, iterable: Iterable[T] | None = None) -> None: + if iterable is None: + self.storage = {} + else: + self.storage = dict.fromkeys(iterable) + + def __contains__(self, item: T) -> bool: + return item in self.storage + + def __iter__(self) -> Iterator[T]: + return iter(self.storage.keys()) + + def update(self, items: OrderedSet[T]) -> None: + self.storage.update(items.storage) + + def add(self, item: T) -> None: + self.storage[item] = None + + def copy(self) -> OrderedSet[T]: + ret: OrderedSet[T] = OrderedSet() + ret.storage = self.storage.copy() + return ret + + @staticmethod + def union(*args: OrderedSet[T]) -> OrderedSet[T]: + ret = args[0].copy() + for s in args[1:]: + ret.update(s) + return ret + + def __or__(self, other: OrderedSet[T]) -> OrderedSet[T]: + return OrderedSet.union(self, other) + + def __ior__(self, other: OrderedSet[T]) -> Self: + self.update(other) + return self + + def __eq__(self, other: object) -> bool: + if isinstance(other, OrderedSet): + return self.storage == other.storage + else: + return set(self.storage.keys()) == other diff --git a/torchgen/yaml_utils.py b/torchgen/yaml_utils.py new file mode 100644 index 00000000000..0278af84bf6 --- /dev/null +++ b/torchgen/yaml_utils.py @@ -0,0 +1,26 @@ +# Safely load fast C Yaml loader/dumper if they are available +try: + from yaml import CSafeLoader as Loader +except ImportError: + from yaml import SafeLoader as Loader # type: ignore[assignment, misc] + +try: + from yaml import CSafeDumper as Dumper +except ImportError: + from yaml import SafeDumper as Dumper # type: ignore[assignment, misc] +YamlDumper = Dumper + + +# A custom loader for YAML that errors on duplicate keys. +# This doesn't happen by default: see https://github.com/yaml/pyyaml/issues/165 +class YamlLoader(Loader): + def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] + mapping = [] + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) # type: ignore[no-untyped-call] + assert ( + key not in mapping + ), f"Found a duplicate key in the yaml. key={key}, line={node.start_mark.line}" + mapping.append(key) + mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] + return mapping
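+
+
+# Illustrative usage (assumes the caller does `import yaml` itself):
+#
+#     with open("native_functions.yaml") as f:
+#         contents = yaml.load(f, Loader=YamlLoader)  # asserts on duplicate keys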