Skip to content

Commit aa2e1d5

Browse files
PavelKopylakiramenai
authored andcommitted
[EraVM] Add the disassembler C-API.
This API accepts just bare EraVM byte code (not in the ELF format). As a result, we employ a heuristic-based approach to understand the byte code structure; in particular, to determine where the section with constants begins and whether a given octet is an instruction, padding, or metadata. This does not work in the general case, so a user may need to check the hex encoding and consult the EraVM binary layout to fully understand the byte code structure.
1 parent de03eeb commit aa2e1d5

File tree

8 files changed

+349
-5
lines changed

8 files changed

+349
-5
lines changed

llvm/include/llvm-c/Assembler.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,19 @@ LLVMBool LLVMAssembleEraVM(LLVMTargetMachineRef T, LLVMMemoryBufferRef InBuffer,
3232
LLVMBool LLVMExceedsSizeLimitEraVM(LLVMMemoryBufferRef MemBuf,
3333
uint64_t MetadataSize);
3434

35+
/** Disassembles the bytecode passed in \p InBuffer starting at
36+
* the offset \p PC. The result is returned via \p OutBuffer.
37+
* In case of an error the function returns 'true' and an error
38+
* message is passed via \p ErrorMessage. The message should be disposed
39+
* by LLVMDisposeMessage. **/
40+
LLVMBool LLVMDisassembleEraVM(LLVMTargetMachineRef T,
41+
LLVMMemoryBufferRef InBuffer, uint64_t PC,
42+
uint64_t Options, LLVMMemoryBufferRef *OutBuffer,
43+
char **ErrorMessage);
44+
45+
/* The option to output the offset and encoding of an instruction. */
46+
#define LLVMDisassemblerEraVM_Option_OutputEncoding 1
47+
3548
LLVM_C_EXTERN_C_END
3649

3750
#endif // LLVM_C_ASSEMBLER_H

llvm/lib/MC/MCC/AssemblerC.cpp

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,14 @@
1212

1313
#include "llvm-c/Assembler.h"
1414
#include "llvm-c/Core.h"
15+
#include "llvm-c/Disassembler.h"
1516
#include "llvm-c/Object.h"
17+
#include "llvm/ADT/APInt.h"
18+
#include "llvm/ADT/StringExtras.h"
1619
#include "llvm/MC/MCAsmBackend.h"
1720
#include "llvm/MC/MCCodeEmitter.h"
1821
#include "llvm/MC/MCContext.h"
22+
#include "llvm/MC/MCInstPrinter.h"
1923
#include "llvm/MC/MCObjectWriter.h"
2024
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
2125
#include "llvm/MC/MCStreamer.h"
@@ -24,9 +28,14 @@
2428
#include "llvm/Object/Binary.h"
2529
#include "llvm/Object/ObjectFile.h"
2630
#include "llvm/Support/MemoryBuffer.h"
31+
#include "llvm/Support/Regex.h"
2732
#include "llvm/Support/SourceMgr.h"
2833
#include "llvm/Target/TargetMachine.h"
2934

35+
#include <algorithm>
36+
#include <limits>
37+
#include <set>
38+
3039
using namespace llvm;
3140
using namespace object;
3241

@@ -181,3 +190,143 @@ LLVMBool LLVMExceedsSizeLimitEraVM(LLVMMemoryBufferRef MemBuf,
181190

182191
return false;
183192
}
193+
194+
/// Symbol lookup hook handed to the LLVM disassembler.
///
/// No symbolic information is available for a bare byte code blob, so the
/// callback always reports "no reference": it stores
/// LLVMDisassembler_ReferenceType_InOut_None into \p ReferenceType and
/// returns a null symbol name. All other arguments are ignored.
static const char *symbolLookupCallback(void * /*DisInfo*/,
                                        uint64_t /*ReferenceValue*/,
                                        uint64_t *ReferenceType,
                                        uint64_t /*ReferencePC*/,
                                        const char ** /*ReferenceName*/) {
  const char *const NoSymbol = nullptr;
  *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
  return NoSymbol;
}
201+
202+
/// Disassembles bare EraVM byte code (not wrapped in ELF) into text.
///
/// The input is processed in two phases:
///   1. Starting at \p PC, 8-byte octets are decoded as instructions. Octets
///      that fail to decode are printed as "<padding>" (all-zero, before any
///      metadata was seen), "<metadata>" (non-zero) or "<unknown>" (all-zero
///      after metadata). While decoding, the smallest word index referenced
///      through a `code[...]` operand is tracked; reaching that word ends the
///      phase, as it is taken to be the start of the constant section.
///   2. The remaining bytes are printed as 32-byte `.cell` constants, each
///      preceded by a numeric label equal to its word index.
///
/// \param T            target machine; only its triple is used.
/// \param InBuffer     byte code; its size must be a multiple of 32.
/// \param PC           start offset; must be a multiple of 8 and must not
///                     exceed the buffer size.
/// \param Options      bit mask; LLVMDisassemblerEraVM_Option_OutputEncoding
///                     prefixes each instruction with its offset and bytes.
/// \param OutBuffer    receives a newly created buffer with the text.
/// \param ErrorMessage receives a strdup'ed message on failure.
/// \returns true on error, false on success.
LLVMBool LLVMDisassembleEraVM(LLVMTargetMachineRef T,
                              LLVMMemoryBufferRef InBuffer, uint64_t PC,
                              uint64_t Options, LLVMMemoryBufferRef *OutBuffer,
                              char **ErrorMessage) {
  TargetMachine *TM = unwrap(T);
  const Triple &TheTriple = TM->getTargetTriple();
  constexpr size_t InstrSize = 8;
  constexpr size_t WordSize = 32;
  constexpr size_t OutStringSize = 1024;
  constexpr uint64_t NoConstantSection = std::numeric_limits<uint64_t>::max();
  MemoryBuffer *InMemBuf = unwrap(InBuffer);
  const auto *Bytes =
      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
      reinterpret_cast<const uint8_t *>(InMemBuf->getBuffer().data());
  const size_t BytesNum = InMemBuf->getBufferSize();

  if (PC > BytesNum) {
    *ErrorMessage = strdup("Starting address exceeds the bytecode size");
    return true;
  }

  if (PC % InstrSize) {
    *ErrorMessage =
        strdup("Starting address isn't multiple of 8 (instruction size)");
    return true;
  }

  if (BytesNum % WordSize) {
    *ErrorMessage = strdup("Bytecode size isn't multiple of 32 (word size)");
    return true;
  }

  LLVMDisasmContextRef DCR = LLVMCreateDisasm(
      TheTriple.getTriple().c_str(), nullptr, 0, nullptr, symbolLookupCallback);
  // Creation fails for reasons outside the caller's control (e.g. the target
  // is not registered), so report it instead of asserting: an assert would
  // vanish in release builds and leave a null context to be dereferenced.
  if (!DCR) {
    *ErrorMessage = strdup("Unable to create disassembler");
    return true;
  }

  std::string Disassembly;
  raw_string_ostream OS(Disassembly);
  formatted_raw_ostream FOS(OS);
  const bool ShouldOutputEncoding =
      Options & LLVMDisassemblerEraVM_Option_OutputEncoding;

  auto PrintEncoding = [&FOS](uint64_t Offset, ArrayRef<uint8_t> InstrBytes) {
    FOS << format("%8" PRIx64 ":", Offset);
    FOS << ' ';
    dumpBytes(InstrBytes, FOS);
  };

  // First, parse the section with instructions. Stop at the beginning
  // of the section with constants (if any).
  uint64_t ConstantSectionStart = NoConstantSection;
  Regex CodeRegex(R"(code\[(r[0-9]+\+)?([0-9]+)\])");
  bool FoundMetadata = false;
  while (PC < BytesNum) {
    // PC is a multiple of 8 and BytesNum a multiple of 32, so a full
    // instruction is always available here.
    std::array<uint8_t, InstrSize> InstrBytes{};
    std::memcpy(InstrBytes.data(), Bytes + PC, InstrSize);

    if (ShouldOutputEncoding)
      PrintEncoding(PC, InstrBytes);

    std::array<char, OutStringSize> OutString{};
    const size_t NumRead =
        LLVMDisasmInstruction(DCR, InstrBytes.data(), InstrBytes.size(),
                              /*PC=*/0, OutString.data(), OutString.size());

    // We are inside the instructions section, i.e. before the constants.
    // Figure out if the current octet is a real instruction, zero-filled
    // padding, or metadata.
    if (!NumRead) {
      const bool IsZeroFilled =
          std::all_of(InstrBytes.begin(), InstrBytes.end(),
                      [](uint8_t Byte) { return Byte == 0; });
      if (IsZeroFilled) {
        FOS << (FoundMetadata ? "\t<unknown>" : "\t<padding>");
      } else {
        FoundMetadata = true;
        FOS << "\t<metadata>";
      }
    } else {
      FOS << OutString.data();
      // Check if the instruction contains a code reference. If so, extract
      // the referenced word number; the smallest one seen so far is the
      // candidate start of the constant section.
      SmallVector<StringRef, 3> Matches;
      if (CodeRegex.match(OutString.data(), &Matches)) {
        // Match Idx = 0 corresponds to whole pattern, Idx = 1
        // to an optional register and Idx = 2 to the displacement.
        uint64_t WordNum = 0;
        // Skip displacements that do not fit into 64 bits rather than
        // treating them as word 0.
        if (to_integer<uint64_t>(Matches[2], WordNum, /*Base=*/10))
          ConstantSectionStart = std::min(ConstantSectionStart, WordNum);
      }
    }
    FOS << '\n';

    PC += InstrSize;
    // If we are at the word boundary and the word is being referenced,
    // this is a beginning of the constant section, so break the cycle.
    if (!(PC % WordSize) && ConstantSectionStart == PC / WordSize)
      break;
  }

  assert((ConstantSectionStart == NoConstantSection ||
          PC == ConstantSectionStart * WordSize) &&
         "Instruction section must end where the constant section starts");

  // Second, print the constants, one 32-byte word per .cell directive.
  while (PC + WordSize <= BytesNum) {
    const uint64_t Word = PC / WordSize;
    assert(PC % WordSize == 0);

    // Emit the numeric label and the .cell directive.
    FOS << Word << ":\n";
    FOS << "\t.cell ";

    // Emit the cell value as a signed integer. The 32 bytes of the word are
    // converted to a hex string, which is then parsed as a 256-bit APInt.
    llvm::SmallString<WordSize * 2> CellHexStr;
    llvm::toHex(llvm::ArrayRef<uint8_t>(Bytes + PC, WordSize),
                /*LowerCase=*/false, CellHexStr);
    APInt CellInt(WordSize * 8, CellHexStr.str(), /*radix=*/16);
    // Print through FOS, like everything else, so the output order does not
    // depend on how FOS buffers on top of OS.
    CellInt.print(FOS, /*isSigned=*/true);
    FOS << '\n';
    PC += WordSize;
  }
  assert(PC == BytesNum);

  // Make sure all text has reached the string before it is copied.
  FOS.flush();
  OS.flush();
  *OutBuffer = LLVMCreateMemoryBufferWithMemoryRangeCopy(
      Disassembly.data(), Disassembly.size(), "result");

  LLVMDisasmDispose(DCR);
  return false;
}

llvm/lib/MC/MCC/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMMCC
55
AsmParser
66
Core
77
MC
8+
MCDisassembler
89
MCParser
910
Object
1011
Support

llvm/lib/Target/EraVM/EraVMAsmPrinter.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,8 +469,11 @@ void EraVMAsmPrinter::emitStartOfAsmFile(Module &M) {
469469
// initializers. The checks for linkage and the presence of an initializer
470470
// are mainly to pass target independent LLVM IR tests.
471471
// The 'constant' globals go to .rodata section, so skip them.
472+
// Only variables with the default address space (AS_STACK) should be
473+
// initialized this way.
472474
if ((G.getLinkage() != GlobalValue::AvailableExternallyLinkage) &&
473-
!G.isConstant() && G.hasInitializer())
475+
!G.isConstant() && G.hasInitializer() &&
476+
G.getAddressSpace() == EraVMAS::AS_STACK)
474477
NumStackElmsToReserve += createInitializeInsts(&G, InitInsts);
475478
}
476479

llvm/lib/Target/EraVM/EraVMInstrInfo.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -953,8 +953,8 @@ def THROW : Pseudo<(outs), (ins GR256:$rs0, pred:$cc), [(EraVMthrow GR256:$rs0)]
953953
def : InstAlias<"ret${cc}", (RETr R1, pred:$cc)>;
954954
def : InstAlias<"rev${cc}", (REVERTr R1, pred:$cc)>;
955955

956-
def : InstAlias<"retl${cc} $dest", (RETrl R1, jmptarget:$dest, pred:$cc)>;
957-
def : InstAlias<"revl${cc} $dest", (REVERTrl R1, jmptarget:$dest, pred:$cc)>;
956+
def : InstAlias<"retl${cc}\t$dest", (RETrl R1, jmptarget:$dest, pred:$cc)>;
957+
def : InstAlias<"revl${cc}\t$dest", (REVERTrl R1, jmptarget:$dest, pred:$cc)>;
958958

959959
def : Pat<(EraVMreturn GR256:$rs0), (RETrl GR256:$rs0, (default_far_return 0))>;
960960
def : Pat<(EraVMrevert GR256:$rs0), (REVERTrl GR256:$rs0, (default_far_revert 0))>;
@@ -1052,7 +1052,7 @@ foreach Op = [OpFarcall, OpDelegate, OpMimic] in {
10521052
!strconcat(Op.Name, ".st.sh")>;
10531053
}
10541054

1055-
def : InstAlias<"call${cc} ${callee}",
1055+
def : InstAlias<"call${cc}\t${callee}",
10561056
(NEAR_CALL_default_unwind R0, jmptarget:$callee, pred:$cc)>;
10571057

10581058
def : Pat<(EraVMfarcall bb:$unwind), (FAR_CALLrrl R1, R2, bb:$unwind)>;

llvm/test/CodeGen/EraVM/global_initializers.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ target triple = "eravm"
1010
@glob.const = constant i256 737
1111

1212
; CHECK-LABEL: .text
13-
; CHECK-NEXT: incsp 10
13+
; CHECK-NEXT: incsp 6
1414
; CHECK-NEXT: add code[@glob_initializer_0], r0, stack[@glob]
1515
; CHECK-NEXT: add code[@glob.arr_initializer_1], r0, stack[@glob.arr + 1]
1616
; CHECK-NEXT: add code[@glob.arr_initializer_3], r0, stack[@glob.arr + 3]

0 commit comments

Comments
 (0)