llvm
diff --git a/‎llvm/docs/MLGO.rst‎
Lines changed: 47 additions & 98 deletions b/‎llvm/docs/MLGO.rst‎
Lines changed: 47 additions & 98 deletions
diff --git a/‎llvm/include/llvm/Analysis/IR2Vec.h‎
Lines changed: 205 additions & 0 deletions b/‎llvm/include/llvm/Analysis/IR2Vec.h‎
Lines changed: 205 additions & 0 deletions
@@ -191,130 +191,79 @@ of loops and regions can be derived from these representations, which can be
 useful in different scenarios. The representations can be useful for various
 downstream tasks, including ML-guided compiler optimizations.
 
-Currently, to use IR2Vec embeddings, the JSON vocabulary first needs to be read
-and used to obtain the vocabulary mapping. Then, use this mapping to
-derive the representations. In LLVM, this process is implemented using two
-independent passes: ``IR2VecVocabAnalysis`` and ``IR2VecAnalysis``. The former
-reads the JSON vocabulary and populates ``IR2VecVocabResult``, which is then used
-by ``IR2VecAnalysis``. 
+The core components are:
+  - **Vocabulary**: A mapping from IR entities (opcodes, types, etc.) to their
+    vector representations. This is managed by ``IR2VecVocabAnalysis``.
+  - **Embedder**: A class (``ir2vec::Embedder``) that uses the vocabulary to
+    compute embeddings for instructions, basic blocks, and functions.
 
-``IR2VecVocabAnalysis`` is immutable and is intended to
-be run once before ``IR2VecAnalysis`` is run. In the future, we plan
-to improve this requirement by automatically generating default the vocabulary mappings
-during build time, eliminating the need for a separate file read.
+Using IR2Vec
+------------
 
-IR2VecAnalysis Usage
---------------------
+To generate embeddings, first obtain the vocabulary. The embeddings can then
+be computed and accessed via an ``ir2vec::Embedder`` instance.
 
-To use IR2Vec in an LLVM-based tool or pass, interaction with the analysis 
-results can be done through the following APIs:
-    
-1. **Accessing the Analysis Results:**
-
-   To access the IR2Vec embeddings, obtain the ``IR2VecAnalysis``
-   result from the Function Analysis Manager (FAM).
+1. **Get the Vocabulary**:
+   In a ModulePass, get the vocabulary analysis result:
 
    .. code-block:: c++
 
-      #include "llvm/Analysis/IR2VecAnalysis.h"
-
-      // ... other includes and code ...
-
-      llvm::FunctionAnalysisManager &FAM = ...; // The FAM instance
-      llvm::Function &F = ...; // The function to analyze
-      auto &IR2VecResult = FAM.getResult<llvm::IR2VecAnalysis>(F);
-
-2. **Checking for Valid Results:**
-
-   Ensure that the analysis result is valid before accessing the embeddings:
-
-   .. code-block:: c++
-
-      if (IR2VecResult.isValid()) {
-        // Proceed to access embeddings
+      auto &VocabRes = MAM.getResult<IR2VecVocabAnalysis>(M);
+      if (!VocabRes.isValid()) {
+        // Handle error: vocabulary is not available or invalid
+        return;
       }
+      const ir2vec::Vocab &Vocabulary = VocabRes.getVocabulary();
+      unsigned Dimension = VocabRes.getDimension();
 
-3. **Retrieving Embeddings:**
+   Note that the ``IR2VecVocabAnalysis`` pass is immutable.
 
-   The ``IR2VecResult`` provides access to embeddings (currently) at three levels:
+2. **Create Embedder instance**:
+   With the vocabulary, create an embedder for a specific function:
 
-   - **Instruction Embeddings:**
+   .. code-block:: c++
 
-     .. code-block:: c++
+      // Assuming F is an llvm::Function&
+      // For example, using IR2VecKind::Symbolic:
+      ErrorOr<std::unique_ptr<ir2vec::Embedder>> EmbOrErr =
+          ir2vec::Embedder::create(IR2VecKind::Symbolic, F, Vocabulary, Dimension);
 
-        const auto &instVecMap = IR2VecResult.getInstVecMap();
-        // instVecMap is a SmallMapVector<const Instruction*, ir2vec::Embedding, 128>
-        for (const auto &it : instVecMap) {
-          const Instruction *I = it.first;
-          const ir2vec::Embedding &embedding = it.second;
-          // Use the instruction embedding
-        }
-   - **Basic Block Embeddings:**
+      if (auto EC = EmbOrErr.getError()) {
+        // Handle error in embedder creation
+        return;
+      }
+      std::unique_ptr<ir2vec::Embedder> Emb = std::move(*EmbOrErr);
 
-     .. code-block:: c++
+3. **Compute and Access Embeddings**:
+   Call ``computeEmbeddings()`` on the embedder instance to compute the 
+   embeddings. Then the embeddings can be accessed using different getter 
+   methods. Currently, ``Embedder`` can generate embeddings at three levels:
+   Instructions, Basic Blocks, and Functions.
 
-        const auto &bbVecMap = IR2VecResult.getBBVecMap();
-        // bbVecMap is a SmallMapVector<const BasicBlock*, ir2vec::Embedding, 16>
-        for (const auto &it : bbVecMap) {
-          const BasicBlock *BB = it.first;
-          const ir2vec::Embedding &embedding = it.second;
-          // Use the basic block embedding
-        }
-   - **Function Embedding:**
+   .. code-block:: c++
 
-     .. code-block:: c++
+      Emb->computeEmbeddings();
+      const ir2vec::Embedding &FuncVector = Emb->getFunctionVector();
+      const ir2vec::InstEmbeddingsMap &InstVecMap = Emb->getInstVecMap();
+      const ir2vec::BBEmbeddingsMap &BBVecMap = Emb->getBBVecMap();
 
-        const ir2vec::Embedding &funcEmbedding = IR2VecResult.getFunctionVector();
-        // Use the function embedding
+      // Example: Iterate over instruction embeddings
+      for (const auto &Entry : InstVecMap) {
+        const Instruction *Inst = Entry.getFirst();
+        const ir2vec::Embedding &InstEmbedding = Entry.getSecond();
+        // Use Inst and InstEmbedding
+      }
 
 4. **Working with Embeddings:**
-
    Embeddings are represented as ``std::vector<double>``. These vectors can be
    used as features for machine learning models, to compute similarity scores
    between different code snippets, or to perform other analyses as needed.
 
-Example Usage
-^^^^^^^^^^^^^
-
-.. code-block:: c++
-
-   #include "llvm/Analysis/IR2VecAnalysis.h"
-   #include "llvm/IR/Function.h"
-   #include "llvm/IR/Instructions.h"
-   #include "llvm/Passes/PassBuilder.h"
-
-   // ... other includes and code ...
-
-   void processFunction(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
-     auto &IR2VecResult = FAM.getResult<llvm::IR2VecAnalysis>(F);
-
-     if (IR2VecResult.isValid()) {
-       const auto &instVecMap = IR2VecResult.getInstVecMap();
-       for (const auto &it : instVecMap) {
-         const Instruction *I = it.first;
-         const auto &embedding = it.second;
-         llvm::errs() << "Instruction: " << *I << "\n";
-         llvm::errs() << "Embedding: ";
-         for (double val : embedding) {
-           llvm::errs() << val << " ";
-         }
-         llvm::errs() << "\n";
-       }
-     } else {
-       llvm::errs() << "IR2Vec analysis failed for function " << F.getName() << "\n";
-     }
-   }
-
-   // ... rest of the pass ...
-
-   // In the pass's run method:
-   // processFunction(F, FAM);
-
 Further Details
 ---------------
 
 For more detailed information about the IR2Vec algorithm, its parameters, and
 advanced usage, please refer to the original paper:
 `IR2Vec: LLVM IR Based Scalable Program Embeddings <https://doi.org/10.1145/3418463>`_.
-The LLVM source code for ``IR2VecAnalysis`` can also be explored to understand the 
+The LLVM source code for ``IR2Vec`` can also be explored to understand the 
 implementation details.
@@ -0,0 +1,205 @@
+//===- IR2Vec.h - Implementation of IR2Vec ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See the LICENSE file for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the IR2Vec vocabulary analysis (IR2VecVocabAnalysis),
+/// the core ir2vec::Embedder interface for generating IR embeddings,
+/// and related utilities like the IR2VecPrinterPass.
+///
+/// Program embeddings are typically a learned representation of the program,
+/// or are derived from one. Such embeddings are used to represent programs
+/// as input to machine learning algorithms. IR2Vec represents the LLVM IR as
+/// embeddings.
+///
+/// The IR2Vec algorithm is described in the following paper:
+///
+///   IR2Vec: LLVM IR Based Scalable Program Embeddings, S. VenkataKeerthy,
+///   Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar, Ramakrishna
+///   Upadrasta, and Y. N. Srikant, ACM Transactions on Architecture and
+///   Code Optimization (TACO), 2020. https://doi.org/10.1145/3418463.
+///   https://arxiv.org/abs/1909.06228
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_IR2VEC_H
+#define LLVM_ANALYSIS_IR2VEC_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/ErrorOr.h"
+#include <map>
+
+namespace llvm {
+
+class Module;
+class BasicBlock;
+class Instruction;
+class Function;
+class Type;
+class Value;
+class raw_ostream;
+
+/// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware.
+/// Symbolic embeddings capture the "syntactic" and "statistical correlation"
+/// of the IR entities. Flow-aware embeddings build on top of symbolic
+/// embeddings and additionally capture the flow information in the IR.
+/// IR2VecKind is used to specify the type of embeddings to generate; it is
+/// the `Mode` argument of ir2vec::Embedder::create().
+/// Currently, only Symbolic embeddings are supported, so Symbolic is the only
+/// enumerator.
+enum class IR2VecKind { Symbolic };
+
+namespace ir2vec {
+/// An embedding: a vector of doubles.
+using Embedding = std::vector<double>;
+/// Maps an instruction to its embedding.
+using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
+/// Maps a basic block to its embedding.
+using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
+/// The vocabulary: maps a string key to the embedding stored for that key.
+// FIXME: Currently the keys are strings. This can be changed to
+// use integers for cheaper lookups.
+using Vocab = std::map<std::string, Embedding>;
+
+/// Embedder provides the interface to generate embeddings (vector
+/// representations) for instructions, basic blocks, and functions. The vector
+/// representations are generated using IR2Vec algorithms.
+///
+/// The Embedder class is an abstract class and it is intended to be
+/// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware.
+/// The constructor is protected; clients obtain an instance through the
+/// create() factory.
+///
+/// An Embedder is bound to a single Function and a single Vocab. Both are
+/// held by reference, so both must outlive the Embedder.
+class Embedder {
+protected:
+  /// The function this embedder is bound to. Held by reference, not owned.
+  const Function &F;
+  /// The vocabulary used for lookups. Held by reference, not owned.
+  const Vocab &Vocabulary;
+
+  /// Weights for different entities (like opcode, arguments, types)
+  /// in the IR instructions to generate the vector representation.
+  // FIXME: Defaults to the values used in the original algorithm. Can be
+  // parameterized later.
+  const float OpcWeight = 1.0, TypeWeight = 0.5, ArgWeight = 0.2;
+
+  /// Dimension of the vector representation; captured from the input vocabulary
+  const unsigned Dimension;
+
+  // Utility maps - these are used to store the vector representations of
+  // instructions, basic blocks and functions.
+  Embedding FuncVector;
+  BBEmbeddingsMap BBVecMap;
+  InstEmbeddingsMap InstVecMap;
+
+  /// Binds the embedder to the function \p F and the vocabulary
+  /// \p Vocabulary. \p Dimension is the dimension of the vector
+  /// representation. Protected: only subclasses (and the create() factory)
+  /// construct embedders.
+  Embedder(const Function &F, const Vocab &Vocabulary, unsigned Dimension);
+
+  /// Lookup vocabulary for a given Key. If the key is not found, it returns a
+  /// zero vector.
+  Embedding lookupVocab(const std::string &Key);
+
+  /// Adds two vectors: Dst += Src
+  void addVectors(Embedding &Dst, const Embedding &Src);
+
+  /// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor
+  void addScaledVector(Embedding &Dst, const Embedding &Src, float Factor);
+
+public:
+  /// Virtual so that a subclass can be destroyed through an Embedder pointer
+  /// (create() hands out std::unique_ptr<Embedder>).
+  virtual ~Embedder() = default;
+
+  /// Top level function to compute embeddings. Given a function, it
+  /// generates embeddings for all the instructions and basic blocks in that
+  /// function. Logic of computing the embeddings is specific to the kind of
+  /// embeddings being computed.
+  virtual void computeEmbeddings() = 0;
+
+  /// Factory method to create an Embedder object.
+  ///
+  /// \param Mode the kind of embeddings the returned Embedder generates.
+  /// \param F the function to generate embeddings for.
+  /// \param Vocabulary the vocabulary to use for lookups.
+  /// \param Dimension the dimension of the vector representation.
+  /// \returns the created Embedder, or an error code if creation failed.
+  // NOTE(review): the exact failure conditions are defined out of line and
+  // are not visible from this header.
+  static ErrorOr<std::unique_ptr<Embedder>> create(IR2VecKind Mode,
+                                                   const Function &F,
+                                                   const Vocab &Vocabulary,
+                                                   unsigned Dimension);
+
+  /// Returns a map containing instructions and the corresponding vector
+  /// representations, as generated by the IR2Vec algorithm for the function
+  /// this Embedder is bound to.
+  // NOTE(review): presumably empty until computeEmbeddings() has been called
+  // - confirm against the implementation.
+  const InstEmbeddingsMap &getInstVecMap() const { return InstVecMap; }
+
+  /// Returns a map containing basic blocks and the corresponding vector
+  /// representations, as generated by the IR2Vec algorithm for the function
+  /// this Embedder is bound to.
+  // NOTE(review): presumably empty until computeEmbeddings() has been called
+  // - confirm against the implementation.
+  const BBEmbeddingsMap &getBBVecMap() const { return BBVecMap; }
+
+  /// Returns the vector representation of the function this Embedder is
+  /// bound to, as generated by the IR2Vec algorithm.
+  const Embedding &getFunctionVector() const { return FuncVector; }
+};
+
+/// Class for computing the Symbolic embeddings of IR2Vec.
+///
+/// Concrete Embedder subclass; it overrides computeEmbeddings().
+class SymbolicEmbedder : public Embedder {
+private:
+  /// Utility function to compute the vector representation for a given basic
+  /// block.
+  Embedding computeBB2Vec(const BasicBlock &BB);
+
+  /// Utility function to compute the vector representation for a given
+  /// function.
+  // NOTE(review): takes no argument; presumably operates on the function the
+  // embedder is bound to - confirm against the implementation.
+  Embedding computeFunc2Vec();
+
+  /// Utility function to compute the vector representation for a given type.
+  Embedding getTypeEmbedding(const Type *Ty);
+
+  /// Utility function to compute the vector representation for a given
+  /// operand.
+  Embedding getOperandEmbedding(const Value *Op);
+
+public:
+  /// Creates a symbolic embedder for \p F using \p Vocabulary. The function
+  /// vector is initialized to \p Dimension zeros.
+  SymbolicEmbedder(const Function &F, const Vocab &Vocabulary,
+                   unsigned Dimension)
+      : Embedder(F, Vocabulary, Dimension) {
+    FuncVector = Embedding(Dimension, 0);
+  }
+  /// Computes the symbolic embeddings; see Embedder::computeEmbeddings().
+  void computeEmbeddings() override;
+};
+
+} // namespace ir2vec
+
+// Forward declaration: IR2VecVocabAnalysis names this type as its Result
+// before the class is defined further down in this header.
+class IR2VecVocabResult;
+
+/// This analysis provides the vocabulary for IR2Vec. The vocabulary provides a
+/// mapping between an entity of the IR (like opcode, type, argument, etc.) and
+/// its corresponding embedding.
+///
+/// It is a module-level analysis: run() takes a Module and produces an
+/// IR2VecVocabResult.
+class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
+  /// Vocabulary held by the analysis object.
+  ir2vec::Vocab Vocabulary;
+  /// Reads the vocabulary and reports failure through the returned Error.
+  // NOTE(review): presumably fills in the Vocabulary member; the source of
+  // the vocabulary is defined out of line - confirm in the implementation.
+  Error readVocabulary();
+
+public:
+  /// Key identifying this analysis.
+  static AnalysisKey Key;
+  IR2VecVocabAnalysis() = default;
+  using Result = IR2VecVocabResult;
+  /// Runs the analysis on \p M and returns the vocabulary result. Callers
+  /// should check IR2VecVocabResult::isValid() before using the result.
+  Result run(Module &M, ModuleAnalysisManager &MAM);
+};
+
+/// Result of IR2VecVocabAnalysis: owns a vocabulary together with a flag
+/// telling whether that vocabulary is usable.
+class IR2VecVocabResult {
+  /// The vocabulary owned by this result.
+  ir2vec::Vocab Vocabulary;
+  /// Validity flag reported by isValid(); false unless a constructor sets it.
+  bool Valid = false;
+
+public:
+  /// Creates an invalid result (isValid() returns false).
+  IR2VecVocabResult() = default;
+  /// Creates a result from \p Vocabulary, which is taken by rvalue reference.
+  // NOTE(review): presumably moves the vocabulary in and marks the result
+  // valid - defined out of line, confirm in the implementation.
+  IR2VecVocabResult(ir2vec::Vocab &&Vocabulary);
+
+  /// Returns true if this result is valid. Check this before calling
+  /// getVocabulary() or getDimension().
+  bool isValid() const { return Valid; }
+  /// Returns the vocabulary.
+  const ir2vec::Vocab &getVocabulary() const;
+  /// Returns the dimension of the embeddings in the vocabulary.
+  // NOTE(review): how the dimension is derived is defined out of line.
+  unsigned getDimension() const;
+  /// Invalidation hook with the signature used by the module analysis
+  /// manager.
+  // NOTE(review): the vocabulary analysis is documented as immutable, so this
+  // presumably always reports "not invalidated" - confirm.
+  bool invalidate(Module &M, const PreservedAnalyses &PA,
+                  ModuleAnalysisManager::Invalidator &Inv);
+};
+
+/// This pass prints the IR2Vec embeddings for instructions, basic blocks, and
+/// functions.
+///
+/// It is a module pass that is constructed with the output stream to use.
+class IR2VecPrinterPass : public PassInfoMixin<IR2VecPrinterPass> {
+  /// Output stream. Held by reference, so it must outlive the pass.
+  raw_ostream &OS;
+  /// Helper taking a single embedding \p Vec.
+  // NOTE(review): presumably writes the elements of Vec to OS - defined out
+  // of line, confirm in the implementation.
+  void printVector(const ir2vec::Embedding &Vec) const;
+
+public:
+  /// Creates a printer pass bound to the stream \p OS.
+  explicit IR2VecPrinterPass(raw_ostream &OS) : OS(OS) {}
+  /// Runs the printer on module \p M.
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+  /// Always returns true.
+  // NOTE(review): presumably so that the pass manager never skips this
+  // printer - confirm against the new pass manager documentation.
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_IR2VEC_H