llvm · mtrofin · May 22, 2025 · Apr 1, 2025 · Apr 1, 2025 · Apr 2, 2025
diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
@@ -347,3 +347,96 @@ clang.
     TODO(mtrofin): 
         - logging, and the use in interactive mode.
         - discuss an example (like the inliner)
+
+IR2Vec Embeddings
+=================
+
+IR2Vec is a program embedding approach designed specifically for LLVM IR. It
+is implemented as a function analysis pass in LLVM. The IR2Vec embeddings
+capture syntactic, semantic, and structural properties of the IR through 
+learned representations. These representations are obtained as a JSON 
+vocabulary that maps the entities of the IR (opcodes, types, operands) to 
+n-dimensional floating point vectors (embeddings). 
+
+With IR2Vec, representations can be obtained at different granularities of the
+IR, such as instructions, basic blocks, and functions. Representations of
+loops and regions can be derived from these when a use case needs them.
+The representations can serve as inputs to various downstream tasks,
+including ML-guided compiler optimizations.
+
+The core components are:
+  - **Vocabulary**: A mapping from IR entities (opcodes, types, etc.) to their
+    vector representations. This is managed by ``IR2VecVocabAnalysis``.
+  - **Embedder**: A class (``ir2vec::Embedder``) that uses the vocabulary to
+    compute embeddings for instructions, basic blocks, and functions.
+
+Using IR2Vec
+------------
+
+For generating embeddings, first the vocabulary should be obtained. Then, the 
+embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance.
+
+1. **Get the Vocabulary**:
+   In a ModulePass, get the vocabulary analysis result:
+
+   .. code-block:: c++
+
+      auto &VocabRes = MAM.getResult<IR2VecVocabAnalysis>(M);
+      if (!VocabRes.isValid()) {
+        // Handle error: vocabulary is not available or invalid
+        return;
+      }
+      const ir2vec::Vocab &Vocabulary = VocabRes.getVocabulary();
+      unsigned Dimension = VocabRes.getDimension();
+
+   Note that the ``IR2VecVocabAnalysis`` pass is immutable.
+
+2. **Create Embedder instance**:
+   With the vocabulary, create an embedder for a specific function:
+
+   .. code-block:: c++
+
+      // Assuming F is an llvm::Function&
+      // For example, using IR2VecKind::Symbolic:
+      Expected<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+          ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary, Dimension);
+
+      if (auto Err = EmbOrErr.takeError()) {
+        // Handle error in embedder creation
+        return;
+      }
+      std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
+
+3. **Compute and Access Embeddings**:
+   Call ``computeEmbeddings()`` on the embedder instance to compute the 
+   embeddings. Then the embeddings can be accessed using different getter 
+   methods. Currently, ``Embedder`` can generate embeddings at three levels:
+   Instructions, Basic Blocks, and Functions.
+
+   .. code-block:: c++
+
+      Emb->computeEmbeddings();
+      const ir2vec::Embedding &FuncVector = Emb->getFunctionVector();
+      const ir2vec::InstEmbeddingsMap &InstVecMap = Emb->getInstVecMap();
+      const ir2vec::BBEmbeddingsMap &BBVecMap = Emb->getBBVecMap();
+
+      // Example: Iterate over instruction embeddings
+      for (const auto &Entry : InstVecMap) {
+        const Instruction *Inst = Entry.getFirst();
+        const ir2vec::Embedding &InstEmbedding = Entry.getSecond();
+        // Use Inst and InstEmbedding
+      }
+
+4. **Working with Embeddings**:
+   Embeddings are represented as ``std::vector<double>``. These vectors can be
+   used as features for machine learning models, to compute similarity scores
+   between different code snippets, or for other analyses as needed.
+
+Further Details
+---------------
+
+For more detailed information about the IR2Vec algorithm, its parameters, and
+advanced usage, please refer to the original paper:
+`IR2Vec: LLVM IR Based Scalable Program Embeddings <https://doi.org/10.1145/3418463>`_.
+The LLVM source code for ``IR2Vec`` can also be explored to understand the 
+implementation details.
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -0,0 +1,200 @@
+//===- IR2Vec.h - Implementation of IR2Vec ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See the LICENSE file for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the IR2Vec vocabulary analysis (IR2VecVocabAnalysis),
+/// the core ir2vec::Embedder interface for generating IR embeddings,
+/// and related utilities like the IR2VecPrinterPass.
+///
+/// Program embeddings are typically learned representations of a program, or
+/// are derived from such representations. Embeddings are used to represent
+/// programs as input to machine learning algorithms. IR2Vec represents the
+/// LLVM IR as embeddings.
+///
+/// The IR2Vec algorithm is described in the following paper:
+///
+///   IR2Vec: LLVM IR Based Scalable Program Embeddings, S. VenkataKeerthy,
+///   Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar, Ramakrishna
+///   Upadrasta, and Y. N. Srikant, ACM Transactions on Architecture and
+///   Code Optimization (TACO), 2020. https://doi.org/10.1145/3418463.
+///   https://arxiv.org/abs/1909.06228
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_IR2VEC_H
+#define LLVM_ANALYSIS_IR2VEC_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/ErrorOr.h"
+#include <map>
+
+namespace llvm {
+
+class Module;
+class BasicBlock;
+class Instruction;
+class Function;
+class Type;
+class Value;
+class raw_ostream;
+
+/// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware.
+/// Symbolic embeddings capture the "syntactic" and "statistical correlation"
+/// of the IR entities. Flow-aware embeddings build on top of symbolic
+/// embeddings and additionally capture the flow information in the IR.
+/// IR2VecKind is passed to ir2vec::Embedder::create() to select the kind.
+/// Currently, only Symbolic embeddings are supported.
+enum class IR2VecKind { Symbolic };
+
+namespace ir2vec {
+using Embedding = std::vector<double>;
+using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
+using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
+// FIXME: Currently the vocabulary keys are strings. This can be changed to
+// use integers for cheaper lookups.
+using Vocab = std::map<std::string, Embedding>;
+
+/// Embedder provides the interface to generate embeddings (vector
+/// representations) for instructions, basic blocks, and functions using the
+/// IR2Vec algorithms. It holds F and Vocabulary by reference, not by value.
+///
+/// Embedder is an abstract class (computeEmbeddings() is pure virtual) that is
+/// intended to be subclassed per IR2Vec algorithm, e.g. Symbolic or Flow-aware.
+class Embedder {
+protected:
+  const Function &F;
+  const Vocab &Vocabulary;
+
+  /// Dimension of the vector representation; captured from the input vocabulary
+  const unsigned Dimension;
+
+  /// Weights for different entities (like opcode, arguments, types)
+  /// in the IR instructions to generate the vector representation.
+  const float OpcWeight, TypeWeight, ArgWeight;
+
+  // Storage for the generated vector representations: one vector for the
+  // function, plus one map each for basic blocks and instructions.
+  Embedding FuncVector;
+  BBEmbeddingsMap BBVecMap;
+  InstEmbeddingsMap InstVecMap;
+
+  Embedder(const Function &F, const Vocab &Vocabulary, unsigned Dimension);
+
+  /// Lookup vocabulary for a given Key. If the key is not found, it returns a
+  /// zero vector.
+  Embedding lookupVocab(const std::string &Key) const;
+
+  /// Adds two vectors: Dst += Src
+  static void addVectors(Embedding &Dst, const Embedding &Src);
+
+  /// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor
+  static void addScaledVector(Embedding &Dst, const Embedding &Src,
+                              float Factor);
+
+public:
+  virtual ~Embedder() = default;
+
+  /// Top level function to compute embeddings. It generates embeddings for all
+  /// the instructions and basic blocks in the function F. Logic of computing
+  /// the embeddings is specific to the kind of embeddings being computed.
+  virtual void computeEmbeddings() = 0;
+
+  /// Factory method to create an Embedder of kind Mode; may return an Error.
+  static Expected<std::unique_ptr<Embedder>> create(IR2VecKind Mode,
+                                                    const Function &F,
+                                                    const Vocab &Vocabulary,
+                                                    unsigned Dimension);
+
+  /// Returns the map from instructions to their vector representations.
+  /// The map belongs to this Embedder, which is created for a single
+  /// function F rather than for a whole module.
+  const InstEmbeddingsMap &getInstVecMap() const { return InstVecMap; }
+
+  /// Returns the map from basic blocks to their vector representations.
+  /// The map belongs to this Embedder, which is created for a single
+  /// function F rather than for a whole module.
+  const BBEmbeddingsMap &getBBVecMap() const { return BBVecMap; }
+
+  /// Returns the vector representation held for the function F, as a
+  /// reference to this Embedder's FuncVector member.
+  const Embedding &getFunctionVector() const { return FuncVector; }
+};
+
+/// Embedder subclass that computes the Symbolic embeddings of IR2Vec.
+/// Symbolic embeddings are constructed based on the entity-level
+/// representations obtained from the Vocabulary.
+class SymbolicEmbedder : public Embedder {
+private:
+  /// Computes the vector representation for the basic block BB and returns
+  /// it by value.
+  Embedding computeBB2Vec(const BasicBlock &BB);
+
+  /// Computes the vector representation for the type Ty.
+  Embedding getTypeEmbedding(const Type *Ty) const;
+
+  /// Computes the vector representation for the operand Op and returns it
+  /// by value.
+  Embedding getOperandEmbedding(const Value *Op) const;
+
+public:
+  SymbolicEmbedder(const Function &F, const Vocab &Vocabulary,
+                   unsigned Dimension)
+      : Embedder(F, Vocabulary, Dimension) {
+    FuncVector = Embedding(Dimension, 0); // Dimension entries, all zero.
+  }
+  void computeEmbeddings() override;
+};
+
+} // namespace ir2vec
+
+/// Result of IR2VecVocabAnalysis: the vocabulary plus a validity flag.
+class IR2VecVocabResult {
+  ir2vec::Vocab Vocabulary;
+  bool Valid = false; // Stays false for a default-constructed result.
+
+public:
+  IR2VecVocabResult() = default;
+  IR2VecVocabResult(ir2vec::Vocab &&Vocabulary);
+
+  bool isValid() const { return Valid; }
+  const ir2vec::Vocab &getVocabulary() const;
+  unsigned getDimension() const;
+  bool invalidate(Module &M, const PreservedAnalyses &PA,
+                  ModuleAnalysisManager::Invalidator &Inv) const;
+};
+
+/// Module analysis that provides the vocabulary for IR2Vec. The vocabulary
+/// maps an entity of the IR (like opcode, type, argument, etc.) to its
+/// corresponding embedding. The result type is IR2VecVocabResult.
+class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
+  ir2vec::Vocab Vocabulary;
+  Error readVocabulary();
+
+public:
+  static AnalysisKey Key;
+  IR2VecVocabAnalysis() = default;
+  using Result = IR2VecVocabResult;
+  Result run(Module &M, ModuleAnalysisManager &MAM);
+};
+
+/// Module pass that prints the IR2Vec embeddings for instructions, basic
+/// blocks, and functions. It keeps a reference to the raw_ostream passed in.
+class IR2VecPrinterPass : public PassInfoMixin<IR2VecPrinterPass> {
+  raw_ostream &OS;
+  void printVector(const ir2vec::Embedding &Vec) const;
+
+public:
+  explicit IR2VecPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_IR2VEC_H
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis
   GlobalsModRef.cpp
   GuardUtils.cpp
   HeatUtils.cpp
+  IR2Vec.cpp
   IRSimilarityIdentifier.cpp
   IVDescriptors.cpp
   IVUsers.cpp