Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions bolt/include/bolt/Core/MCInstUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef BOLT_CORE_MCINSTUTILS_H
#define BOLT_CORE_MCINSTUTILS_H

#include "bolt/Core/BinaryBasicBlock.h"
#include <map>
#include <variant>

namespace llvm {
class MCCodeEmitter;
}

namespace llvm {
namespace bolt {

class BinaryFunction;

/// MCInstReference represents a reference to a constant MCInst as stored either
/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock
/// (after a CFG is created).
///
/// The reference may be invalidated when the function containing the referenced
/// instruction is modified.
class MCInstReference {
public:
using nocfg_const_iterator = std::map<uint32_t, MCInst>::const_iterator;

/// Constructs an empty reference: a RefInBB whose basic block pointer is
/// null, which is the state that empty() reports as true.
MCInstReference() : Reference(RefInBB(nullptr, /*Index=*/0)) {}
/// Constructs a reference to the instruction inside the basic block.
MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, if you don't expect to pass/handle nullptr as a pointer, prefer a reference. You might need a special case for an empty reference.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for pointing this out. I replaced a number of pointer-typed arguments with references in 0558f1b, and now it should be clearer to the user when empty references can be returned (the privately-used RefInBB and RefInBF helper classes still use pointers, though). Furthermore, I made MCInstReference::get accept both arguments by reference in 5a7414c: while it could make sense to return an empty reference for (nullptr, SomeFunction) arguments (but even this usage is slightly questionable), it is definitely strange to pass an unrelated instruction and function and expect MCInstReference::get to silently return an empty reference.

: Reference(RefInBB(BB, getInstIndexInBB(BB, Inst))) {}
/// Constructs a reference to the instruction at position \p Index inside the
/// basic block \p BB.
///
/// \p BB must not be null (asserted). \p Index is not validated here; it is
/// checked against the block size when the instruction is accessed through
/// getMCInst().
MCInstReference(const BinaryBasicBlock *BB, unsigned Index)
: Reference(RefInBB(BB, Index)) {
assert(BB && "Basic block should not be nullptr");
}
/// Constructs a reference to the instruction inside the function without
/// CFG information.
///
/// \p BF must not be null (asserted). \p It is stored as-is and is
/// dereferenced later by getMCInst().
// NOTE(review): \p It is presumably an iterator into \p BF's own instruction
// map - confirm against callers.
MCInstReference(const BinaryFunction *BF, nocfg_const_iterator It)
: Reference(RefInBF(BF, It)) {
assert(BF && "Function should not be nullptr");
}

/// Locates an instruction inside a function and returns a reference.
static MCInstReference get(const MCInst *Inst, const BinaryFunction &BF);

/// Compares two references by comparing the underlying variants: they are
/// equal when both hold the same kind of reference (in a basic block or in a
/// function) and the stored RefInBB / RefInBF values compare equal.
bool operator==(const MCInstReference &Other) const {
return Reference == Other.Reference;
}

/// Returns the referenced instruction. The reference must not be empty.
///
/// For a reference into a basic block, the stored index is checked against
/// the current size of the block before the instruction is fetched.
const MCInst &getMCInst() const {
  assert(!empty() && "Empty reference");

  const auto *InBB = tryGetRefInBB();
  if (!InBB) {
    // No CFG: the iterator points directly at the (offset, instruction) pair.
    return getRefInBF().It->second;
  }

  [[maybe_unused]] unsigned NumInstructions = InBB->BB->size();
  assert(InBB->Index < NumInstructions && "Invalid reference");
  return InBB->BB->getInstructionAtIndex(InBB->Index);
}

/// Implicit conversion to the referenced instruction; forwards to
/// getMCInst(), so the reference must not be empty.
operator const MCInst &() const { return getMCInst(); }

/// Returns true if this reference does not point to any instruction, i.e.
/// the stored parent pointer (basic block or function) is null.
bool empty() const {
  const auto *InBB = tryGetRefInBB();
  if (!InBB)
    return getRefInBF().BF == nullptr;
  return InBB->BB == nullptr;
}

/// Returns true if the reference is non-empty and points into a basic block
/// (as opposed to pointing directly into a function without CFG).
bool hasCFG() const {
  if (empty())
    return false;
  return tryGetRefInBB() != nullptr;
}

/// Returns the function containing the referenced instruction: the parent
/// function of the basic block when the reference points into a basic block,
/// otherwise the function stored in the reference itself.
/// The reference must not be empty.
const BinaryFunction *getFunction() const {
  assert(!empty() && "Empty reference");

  const auto *InBB = tryGetRefInBB();
  if (!InBB)
    return getRefInBF().BF;
  return InBB->BB->getFunction();
}

/// Returns the basic block containing the referenced instruction, or nullptr
/// when the reference points into a function without CFG.
/// The reference must not be empty.
const BinaryBasicBlock *getBasicBlock() const {
  assert(!empty() && "Empty reference");

  const auto *InBB = tryGetRefInBB();
  return InBB ? InBB->BB : nullptr;
}

/// Computes an address for the referenced instruction.
///
/// NOTE(review): per the review discussion on this change, the value is
/// recomputed on every call and is an approximation intended for debug
/// printing and best-effort reporting. It is expected to be only as accurate
/// as BinaryFunction::getAddress() and BinaryBasicBlock::getOffset(), i.e. it
/// relates to the input binary rather than the rewritten output - confirm
/// against the implementation in MCInstUtils.cpp.
///
/// \p Emitter is optional and defaults to nullptr.
// MCCodeEmitter is not thread safe.
uint64_t getAddress(const MCCodeEmitter *Emitter = nullptr) const;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add documentation for this method. What's the expected address returned?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Writing the documentation made it obvious to me that a name starting with "get" is rather misleading for this method:

  • it does not just retrieve the address from a field of some object, it performs computations every time
  • it does not return "the" address, it returns an approximation that is expected to be good enough for debug printing (and for the addresses reported by gadget scanner to the user, which are, frankly speaking, reported on the best-effort basis)

For that reason, I renamed this method in b068098 and refactored its callers a bit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. To clarify, the expected output is the address of the instruction in the input binary? I.e. it's only valid in the context of instructions that were present in the input?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... My particular use case is "read-only" analysis, but initially I thought getAddress() should work equally well for the "original" and "rewritten" code. Now I see getAddress() can only be as accurate as BinaryFunction::getAddress() and BinaryBasicBlock::getOffset().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, BinaryFunction::getAddress() returns the original address of the function. For the output, we don't know the address until code emission and mapping. If you are interested in output addresses, you can take a look at BinaryFunction::translateInputToOutputAddress() for approximation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for clarification, updated the description in ec6bff9.


raw_ostream &print(raw_ostream &OS) const;

private:
static unsigned getInstIndexInBB(const BinaryBasicBlock *BB,
const MCInst *Inst) {
assert(BB && Inst && "Neither BB nor Inst should be nullptr");
// Usage of pointer arithmetic assumes the instructions are stored in a
// vector, see BasicBlockStorageIsVector in MCInstUtils.cpp.
const MCInst *FirstInstInBB = &*BB->begin();
return Inst - FirstInstInBB;
}

// Two cases are possible:
// * functions with CFG reconstructed - a function stores a collection of
// basic blocks, each basic block stores a contiguous vector of MCInst
// * functions without CFG - there are no basic blocks created,
// the instructions are directly stored in std::map in BinaryFunction
//
// In both cases, the direct parent of MCInst is stored together with an
// index or iterator pointing to the instruction.

// Reference used when the CFG is available: the instruction is identified by
// its direct parent (a basic block) and its position inside that block.
struct RefInBB {
  const BinaryBasicBlock *BB;
  unsigned Index;

  RefInBB(const BinaryBasicBlock *Block, unsigned Position)
      : BB(Block), Index(Position) {}
  RefInBB(const RefInBB &Other) = default;
  RefInBB &operator=(const RefInBB &Other) = default;

  // Two references are equal when they name the same block and position.
  bool operator==(const RefInBB &Other) const {
    if (BB != Other.BB)
      return false;
    return Index == Other.Index;
  }
};

// Helper struct: CFG is *not* available, the direct parent is a function.
// The instruction is identified by a std::map<uint32_t, MCInst>::const_iterator
// (the key is the instruction's offset, the mapped value is the instruction
// itself).
struct RefInBF {
  RefInBF(const BinaryFunction *BF, nocfg_const_iterator It)
      : BF(BF), It(It) {}
  RefInBF(const RefInBF &Other) = default;
  RefInBF &operator=(const RefInBF &Other) = default;

  const BinaryFunction *BF;
  nocfg_const_iterator It;

  // The iterators are compared directly instead of through the keys they
  // point to, so no iterator is dereferenced here (dereferencing would be
  // undefined for a past-the-end iterator). They are only compared when both
  // references name the same function, assuming both iterators point into
  // that function's instruction map.
  bool operator==(const RefInBF &Other) const {
    return BF == Other.BF && It == Other.It;
  }
};

std::variant<RefInBB, RefInBF> Reference;

// Utility methods to be used like this:
//
// if (auto *Ref = tryGetRefInBB())
// return Ref->doSomething(...);
// return getRefInBF().doSomethingElse(...);
// Returns the stored RefInBB, or nullptr if the variant holds a RefInBF.
// The assertion checks that the variant holds one of its two alternatives.
const RefInBB *tryGetRefInBB() const {
  const RefInBB *InBB = std::get_if<RefInBB>(&Reference);
  assert(InBB || std::get_if<RefInBF>(&Reference));
  return InBB;
}
// Returns the stored RefInBF. Must only be called when the variant holds a
// RefInBF, i.e. after tryGetRefInBB() returned nullptr.
const RefInBF &getRefInBF() const {
  const RefInBF *InBF = std::get_if<RefInBF>(&Reference);
  assert(InBF);
  return *InBF;
}
};

/// Streams \p Ref to \p OS by delegating to MCInstReference::print() and
/// returns whatever stream print() returns.
static inline raw_ostream &operator<<(raw_ostream &OS,
const MCInstReference &Ref) {
return Ref.print(OS);
}

} // namespace bolt
} // namespace llvm

#endif
176 changes: 1 addition & 175 deletions bolt/include/bolt/Passes/PAuthGadgetScanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,187 +11,13 @@

#include "bolt/Core/BinaryContext.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/MCInstUtils.h"
#include "bolt/Passes/BinaryPasses.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

namespace llvm {
namespace bolt {

/// @brief MCInstReference represents a reference to an MCInst as stored either
/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock
/// (after a CFG is created). It aims to store the necessary information to be
/// able to find the specific MCInst in either the BinaryFunction or
/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of
/// the corresponding instruction can be computed.

/// Reference to an instruction stored in a basic block, identified by the
/// block and the index of the instruction inside it. A default-constructed
/// reference has a null BB.
struct MCInstInBBReference {
  BinaryBasicBlock *BB;
  int64_t BBIndex;

  MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex)
      : BB(BB), BBIndex(BBIndex) {}
  MCInstInBBReference() : BB(nullptr), BBIndex(0) {}

  /// Scans every basic block of \p BF for the instruction object located at
  /// address \p Inst. Returns a default-constructed reference (null BB) if no
  /// such instruction is found.
  static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) {
    for (BinaryBasicBlock &BB : BF)
      for (size_t I = 0; I < BB.size(); ++I)
        if (Inst == &BB.getInstructionAtIndex(I))
          return MCInstInBBReference(&BB, I);
    return {};
  }

  bool operator==(const MCInstInBBReference &RHS) const {
    return BB == RHS.BB && BBIndex == RHS.BBIndex;
  }

  /// Orders references lexicographically by (BB pointer, BBIndex).
  bool operator<(const MCInstInBBReference &RHS) const {
    return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex);
  }

  /// Converts to the referenced instruction; BB must not be null.
  operator MCInst &() const {
    assert(BB != nullptr);
    return BB->getInstructionAtIndex(BBIndex);
  }

  /// Returns the function address plus the block offset plus the offset of
  /// the instruction inside the block. BB must not be null.
  uint64_t getAddress() const {
    // Same precondition as the conversion operator: a default-constructed
    // reference must not be dereferenced.
    assert(BB != nullptr && "Cannot compute the address of an empty reference");
    // 4 bytes per instruction on AArch64.
    // FIXME: the assumption of 4 byte per instruction needs to be fixed before
    // this method gets used on any non-AArch64 binaries (but should be fine for
    // pac-ret analysis, as that is an AArch64-specific feature).
    return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4;
  }
};

raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &);

/// Reference to an instruction stored directly in a function (no CFG),
/// identified by the function and the offset used as the key of the
/// instruction in BF.instrs(). A default-constructed reference has a null BF.
struct MCInstInBFReference {
  BinaryFunction *BF;
  uint64_t Offset;

  MCInstInBFReference(BinaryFunction *BF, uint64_t Offset)
      : BF(BF), Offset(Offset) {}
  MCInstInBFReference() : BF(nullptr), Offset(0) {}

  /// Scans BF.instrs() for the instruction object located at address \p Inst.
  /// Returns a default-constructed reference (null BF) if it is not found.
  static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) {
    for (auto &I : BF.instrs())
      if (Inst == &I.second)
        return MCInstInBFReference(&BF, I.first);
    return {};
  }

  bool operator==(const MCInstInBFReference &RHS) const {
    return BF == RHS.BF && Offset == RHS.Offset;
  }

  /// Orders references lexicographically by (BF pointer, Offset).
  bool operator<(const MCInstInBFReference &RHS) const {
    return std::tie(BF, Offset) < std::tie(RHS.BF, RHS.Offset);
  }

  /// Converts to the referenced instruction; BF must not be null and an
  /// instruction must exist at Offset.
  operator MCInst &() const {
    assert(BF != nullptr);
    auto *Inst = BF->getInstructionAtOffset(Offset);
    // Check the lookup result before dereferencing it.
    assert(Inst != nullptr && "No instruction at the stored offset");
    return *Inst;
  }

  uint64_t getOffset() const { return Offset; }

  /// Returns the function address plus the stored offset. BF must not be
  /// null.
  uint64_t getAddress() const {
    assert(BF != nullptr && "Cannot compute the address of an empty reference");
    return BF->getAddress() + getOffset();
  }
};

raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &);

// Tagged union over MCInstInBBReference and MCInstInBFReference.
// ParentKind records which member of U was initialized by the constructor;
// every accessor below dispatches on ParentKind to read that member.
struct MCInstReference {
enum Kind { FunctionParent, BasicBlockParent };
Kind ParentKind;
union U {
MCInstInBBReference BBRef;
MCInstInBFReference BFRef;
U(MCInstInBBReference BBRef) : BBRef(BBRef) {}
U(MCInstInBFReference BFRef) : BFRef(BFRef) {}
} U;
// Implicit conversions from the two concrete reference kinds; each sets
// ParentKind to match the union member it initializes.
MCInstReference(MCInstInBBReference BBRef)
: ParentKind(BasicBlockParent), U(BBRef) {}
MCInstReference(MCInstInBFReference BFRef)
: ParentKind(FunctionParent), U(BFRef) {}
// Convenience constructors that build the concrete reference first.
MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex)
: MCInstReference(MCInstInBBReference(BB, BBIndex)) {}
MCInstReference(BinaryFunction *BF, uint32_t Offset)
: MCInstReference(MCInstInBFReference(BF, Offset)) {}

// Locates Inst in BF: searches the basic blocks when BF.hasCFG() is true,
// otherwise searches the function-level instructions. The concrete result
// is converted through the matching constructor above.
static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) {
if (BF.hasCFG())
return MCInstInBBReference::get(Inst, BF);
return MCInstInBFReference::get(Inst, BF);
}

// Orders by ParentKind first; references of the same kind are ordered by
// the operator< of the active member.
bool operator<(const MCInstReference &RHS) const {
if (ParentKind != RHS.ParentKind)
return ParentKind < RHS.ParentKind;
switch (ParentKind) {
case BasicBlockParent:
return U.BBRef < RHS.U.BBRef;
case FunctionParent:
return U.BFRef < RHS.U.BFRef;
}
llvm_unreachable("");
}

// References of different kinds never compare equal; otherwise the active
// members are compared.
bool operator==(const MCInstReference &RHS) const {
if (ParentKind != RHS.ParentKind)
return false;
switch (ParentKind) {
case BasicBlockParent:
return U.BBRef == RHS.U.BBRef;
case FunctionParent:
return U.BFRef == RHS.U.BFRef;
}
llvm_unreachable("");
}

// Converts to the referenced instruction through the conversion operator of
// the active member.
operator MCInst &() const {
switch (ParentKind) {
case BasicBlockParent:
return U.BBRef;
case FunctionParent:
return U.BFRef;
}
llvm_unreachable("");
}

// True when the parent pointer (BB or BF) of the active member is non-null.
operator bool() const {
switch (ParentKind) {
case BasicBlockParent:
return U.BBRef.BB != nullptr;
case FunctionParent:
return U.BFRef.BF != nullptr;
}
llvm_unreachable("");
}

// Forwards to getAddress() of the active member.
uint64_t getAddress() const {
switch (ParentKind) {
case BasicBlockParent:
return U.BBRef.getAddress();
case FunctionParent:
return U.BFRef.getAddress();
}
llvm_unreachable("");
}

// Returns the stored function, or the parent function of the stored basic
// block. The basic block pointer is dereferenced without a null check.
BinaryFunction *getFunction() const {
switch (ParentKind) {
case FunctionParent:
return U.BFRef.BF;
case BasicBlockParent:
return U.BBRef.BB->getFunction();
}
llvm_unreachable("");
}

// Returns the stored basic block, or nullptr for a function-level reference.
BinaryBasicBlock *getBasicBlock() const {
switch (ParentKind) {
case FunctionParent:
return nullptr;
case BasicBlockParent:
return U.BBRef.BB;
}
llvm_unreachable("");
}
};

raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &);

namespace PAuthGadgetScanner {

// The report classes are designed to be used in an immutable manner.
Expand Down
1 change: 1 addition & 0 deletions bolt/lib/Core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTCore
GDBIndex.cpp
HashUtilities.cpp
JumpTable.cpp
MCInstUtils.cpp
MCPlusBuilder.cpp
ParallelUtilities.cpp
Relocation.cpp
Expand Down
Loading
Loading