Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 177 additions & 41 deletions llvm/include/llvm/Analysis/IR2Vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "llvm/Support/JSON.h"
#include <array>
#include <map>
#include <optional>

namespace llvm {

Expand Down Expand Up @@ -144,6 +145,73 @@ struct Embedding {
using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;

/// Generic storage class for section-based vocabularies.
/// VocabStorage provides a generic foundation for storing and accessing
/// embeddings organized into sections.
class VocabStorage {
private:
/// Section-based storage
std::vector<std::vector<Embedding>> Sections;

const size_t TotalSize;
const unsigned Dimension;

public:
/// Default constructor creates empty storage (invalid state)
VocabStorage() : Sections(), TotalSize(0), Dimension(0) {}

/// Create a VocabStorage with pre-organized section data
VocabStorage(std::vector<std::vector<Embedding>> &&SectionData);

VocabStorage(VocabStorage &&) = default;
VocabStorage &operator=(VocabStorage &&) = delete;

VocabStorage(const VocabStorage &) = delete;
VocabStorage &operator=(const VocabStorage &) = delete;

/// Get total number of entries across all sections
size_t size() const { return TotalSize; }

/// Get number of sections
unsigned getNumSections() const {
return static_cast<unsigned>(Sections.size());
}

/// Section-based access: Storage[sectionId][localIndex]
const std::vector<Embedding> &operator[](unsigned SectionId) const {
assert(SectionId < Sections.size() && "Invalid section ID");
return Sections[SectionId];
}

/// Get vocabulary dimension
unsigned getDimension() const { return Dimension; }

/// Check if vocabulary is valid (has data)
bool isValid() const { return TotalSize > 0; }

/// Iterator support for section-based access
class const_iterator {
const VocabStorage *Storage;
unsigned SectionId = 0;
size_t LocalIndex = 0;

public:
const_iterator(const VocabStorage *Storage, unsigned SectionId,
size_t LocalIndex)
: Storage(Storage), SectionId(SectionId), LocalIndex(LocalIndex) {}

LLVM_ABI const Embedding &operator*() const;
LLVM_ABI const_iterator &operator++();
LLVM_ABI bool operator==(const const_iterator &Other) const;
LLVM_ABI bool operator!=(const const_iterator &Other) const;
};

const_iterator begin() const { return const_iterator(this, 0, 0); }
const_iterator end() const {
return const_iterator(this, getNumSections(), 0);
}
};

/// Class for storing and accessing the IR2Vec vocabulary.
/// The Vocabulary class manages seed embeddings for LLVM IR entities. The
/// seed embeddings are the initial learned representations of the entities
Expand All @@ -164,7 +232,7 @@ using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
class Vocabulary {
friend class llvm::IR2VecVocabAnalysis;

// Vocabulary Slot Layout:
// Vocabulary Layout:
// +----------------+------------------------------------------------------+
// | Entity Type | Index Range |
// +----------------+------------------------------------------------------+
Expand All @@ -180,8 +248,16 @@ class Vocabulary {
// and improves learning. Operands include Comparison predicates
// (ICmp/FCmp) along with other operand types. This can be extended to
// include other specializations in future.
using VocabVector = std::vector<ir2vec::Embedding>;
VocabVector Vocab;
enum class Section : unsigned {
Opcodes = 0,
CanonicalTypes = 1,
Operands = 2,
Predicates = 3,
MaxSections
};

// Use section-based storage for better organization and efficiency
VocabStorage Storage;

static constexpr unsigned NumICmpPredicates =
static_cast<unsigned>(CmpInst::LAST_ICMP_PREDICATE) -
Expand Down Expand Up @@ -233,10 +309,23 @@ class Vocabulary {
NumICmpPredicates + NumFCmpPredicates;

Vocabulary() = default;
LLVM_ABI Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)) {}
LLVM_ABI Vocabulary(VocabStorage &&Storage) : Storage(std::move(Storage)) {}

Vocabulary(const Vocabulary &) = delete;
Vocabulary &operator=(const Vocabulary &) = delete;

Vocabulary(Vocabulary &&) = default;
Vocabulary &operator=(Vocabulary &&Other) = delete;

LLVM_ABI bool isValid() const {
return Storage.size() == NumCanonicalEntries;
}

LLVM_ABI unsigned getDimension() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
return Storage.getDimension();
}

LLVM_ABI bool isValid() const { return Vocab.size() == NumCanonicalEntries; };
LLVM_ABI unsigned getDimension() const;
/// Total number of entries (opcodes + canonicalized types + operand kinds +
/// predicates)
static constexpr size_t getCanonicalSize() { return NumCanonicalEntries; }
Expand All @@ -245,59 +334,91 @@ class Vocabulary {
LLVM_ABI static StringRef getVocabKeyForOpcode(unsigned Opcode);

/// Function to get vocabulary key for a given TypeID
LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID) {
return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID));
}

/// Function to get vocabulary key for a given OperandKind
LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind);
LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind) {
unsigned Index = static_cast<unsigned>(Kind);
assert(Index < MaxOperandKinds && "Invalid OperandKind");
return OperandKindNames[Index];
}

/// Function to classify an operand into OperandKind
LLVM_ABI static OperandKind getOperandKind(const Value *Op);

/// Function to get vocabulary key for a given predicate
LLVM_ABI static StringRef getVocabKeyForPredicate(CmpInst::Predicate P);

/// Functions to return the slot index or position of a given Opcode, TypeID,
/// or OperandKind in the vocabulary.
LLVM_ABI static unsigned getSlotIndex(unsigned Opcode);
LLVM_ABI static unsigned getSlotIndex(Type::TypeID TypeID);
LLVM_ABI static unsigned getSlotIndex(const Value &Op);
LLVM_ABI static unsigned getSlotIndex(CmpInst::Predicate P);
/// Functions to return flat index
LLVM_ABI static unsigned getIndex(unsigned Opcode) {
assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
return Opcode - 1; // Convert to zero-based index
}

LLVM_ABI static unsigned getIndex(Type::TypeID TypeID) {
assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID));
}

LLVM_ABI static unsigned getIndex(const Value &Op) {
unsigned Index = static_cast<unsigned>(getOperandKind(&Op));
assert(Index < MaxOperandKinds && "Invalid OperandKind");
return OperandBaseOffset + Index;
}

LLVM_ABI static unsigned getIndex(CmpInst::Predicate P) {
return PredicateBaseOffset + getPredicateLocalIndex(P);
}

/// Accessors to get the embedding for a given entity.
LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const;
LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const;
LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const;
LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const {
assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
return Storage[static_cast<unsigned>(Section::Opcodes)][Opcode - 1];
}

LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeID) const {
assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
unsigned LocalIndex = static_cast<unsigned>(getCanonicalTypeID(TypeID));
return Storage[static_cast<unsigned>(Section::CanonicalTypes)][LocalIndex];
}

LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const {
unsigned LocalIndex = static_cast<unsigned>(getOperandKind(&Arg));
assert(LocalIndex < MaxOperandKinds && "Invalid OperandKind");
return Storage[static_cast<unsigned>(Section::Operands)][LocalIndex];
}

LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const {
unsigned LocalIndex = getPredicateLocalIndex(P);
return Storage[static_cast<unsigned>(Section::Predicates)][LocalIndex];
}

/// Const Iterator type aliases
using const_iterator = VocabVector::const_iterator;
using const_iterator = VocabStorage::const_iterator;

const_iterator begin() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
return Vocab.begin();
return Storage.begin();
}

const_iterator cbegin() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
return Vocab.cbegin();
}
const_iterator cbegin() const { return begin(); }

const_iterator end() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
return Vocab.end();
return Storage.end();
}

const_iterator cend() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
return Vocab.cend();
}
const_iterator cend() const { return end(); }

/// Returns the string key for a given index position in the vocabulary.
/// This is useful for debugging or printing the vocabulary. Do not use this
/// for embedding generation as string based lookups are inefficient.
LLVM_ABI static StringRef getStringKey(unsigned Pos);

/// Create a dummy vocabulary for testing purposes.
LLVM_ABI static VocabVector createDummyVocabForTest(unsigned Dim = 1);
LLVM_ABI static VocabStorage createDummyVocabForTest(unsigned Dim = 1);

LLVM_ABI bool invalidate(Module &M, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &Inv) const;
Expand All @@ -306,12 +427,16 @@ class Vocabulary {
constexpr static unsigned NumCanonicalEntries =
MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds + MaxPredicateKinds;

// Base offsets for slot layout to simplify index computation
// Base offsets for flat index computation
constexpr static unsigned OperandBaseOffset =
MaxOpcodes + MaxCanonicalTypeIDs;
constexpr static unsigned PredicateBaseOffset =
OperandBaseOffset + MaxOperandKinds;

/// Functions for predicate index calculations
static unsigned getPredicateLocalIndex(CmpInst::Predicate P);
static CmpInst::Predicate getPredicateFromLocalIndex(unsigned LocalIndex);

/// String mappings for CanonicalTypeID values
static constexpr StringLiteral CanonicalTypeNames[] = {
"FloatTy", "VoidTy", "LabelTy", "MetadataTy",
Expand Down Expand Up @@ -358,15 +483,26 @@ class Vocabulary {

/// Function to get vocabulary key for canonical type by enum
LLVM_ABI static StringRef
getVocabKeyForCanonicalTypeID(CanonicalTypeID CType);
getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) {
unsigned Index = static_cast<unsigned>(CType);
assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID");
return CanonicalTypeNames[Index];
}

/// Function to convert TypeID to CanonicalTypeID
LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID);
LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID) {
unsigned Index = static_cast<unsigned>(TypeID);
assert(Index < MaxTypeIDs && "Invalid TypeID");
return TypeIDMapping[Index];
}

/// Function to get the predicate enum value for a given index. Index is
/// relative to the predicates section of the vocabulary. E.g., Index 0
/// corresponds to the first predicate.
LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index);
LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index) {
assert(Index < MaxPredicateKinds && "Invalid predicate index");
return getPredicateFromLocalIndex(Index);
}
};

/// Embedder provides the interface to generate embeddings (vector
Expand Down Expand Up @@ -459,22 +595,22 @@ class LLVM_ABI FlowAwareEmbedder : public Embedder {
/// mapping between an entity of the IR (like opcode, type, argument, etc.) and
/// its corresponding embedding.
class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
using VocabVector = std::vector<ir2vec::Embedding>;
using VocabMap = std::map<std::string, ir2vec::Embedding>;
VocabMap OpcVocab, TypeVocab, ArgVocab;
VocabVector Vocab;
std::optional<ir2vec::VocabStorage> Vocab;

Error readVocabulary();
Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab,
VocabMap &ArgVocab);
Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue,
VocabMap &TargetVocab, unsigned &Dim);
void generateNumMappedVocab();
void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab,
VocabMap &ArgVocab);
void emitError(Error Err, LLVMContext &Ctx);

public:
LLVM_ABI static AnalysisKey Key;
IR2VecVocabAnalysis() = default;
LLVM_ABI explicit IR2VecVocabAnalysis(const VocabVector &Vocab);
LLVM_ABI explicit IR2VecVocabAnalysis(VocabVector &&Vocab);
LLVM_ABI explicit IR2VecVocabAnalysis(ir2vec::VocabStorage &&Vocab)
: Vocab(std::move(Vocab)) {}
using Result = ir2vec::Vocabulary;
LLVM_ABI Result run(Module &M, ModuleAnalysisManager &MAM);
};
Expand Down
Loading