diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index f7cf538bd0e86..50d652a8bc677 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1412,13 +1412,14 @@ class MCPlusBuilder {
     return false;
   }
 
-  /// Modify a direct call instruction \p Inst with an indirect call taking
-  /// a destination from a memory location pointed by \p TargetLocation symbol.
-  virtual bool convertCallToIndirectCall(MCInst &Inst,
-                                         const MCSymbol *TargetLocation,
-                                         MCContext *Ctx) {
+  /// Create a sequence of instructions making an indirect call to the function
+  /// targeted by the direct PLT call \p DirectCall. The function's address is
+  /// stored at the memory location pointed to by the \p TargetLocation symbol.
+  virtual InstructionListType
+  createIndirectPltCall(const MCInst &DirectCall,
+                        const MCSymbol *TargetLocation, MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
+    return {};
   }
 
   /// Morph an indirect call into a load where \p Reg holds the call target.
diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp
index d0276f22e14ef..2ed996fadbb99 100644
--- a/bolt/lib/Passes/PLTCall.cpp
+++ b/bolt/lib/Passes/PLTCall.cpp
@@ -48,8 +48,8 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) {
     return Error::success();
 
   uint64_t NumCallsOptimized = 0;
-  for (auto &It : BC.getBinaryFunctions()) {
-    BinaryFunction &Function = It.second;
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    BinaryFunction &Function = BFI.second;
     if (!shouldOptimize(Function))
       continue;
 
@@ -61,18 +61,24 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) {
       if (opts::PLT == OT_HOT && !BB.getKnownExecutionCount())
         continue;
 
-      for (MCInst &Instr : BB) {
-        if (!BC.MIB->isCall(Instr))
+      for (auto II = BB.begin(); II != BB.end(); ++II) {
+        if (!BC.MIB->isCall(*II))
           continue;
-        const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr);
+        const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(*II);
         if (!CallSymbol)
           continue;
         const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CallSymbol);
         if (!CalleeBF || !CalleeBF->isPLTFunction())
           continue;
-        BC.MIB->convertCallToIndirectCall(Instr, CalleeBF->getPLTSymbol(),
-                                          BC.Ctx.get());
-        BC.MIB->addAnnotation(Instr, "PLTCall", true);
+        const InstructionListType NewCode = BC.MIB->createIndirectPltCall(
+            *II, CalleeBF->getPLTSymbol(), BC.Ctx.get());
+        // Check before use: an empty list would break the advance below.
+        assert(!NewCode.empty() && "PLT Call replacement must be non-empty");
+        // Swap in the new code and step to its last instruction, which gets
+        // the annotation; the loop then continues after the whole sequence.
+        II = BB.replaceInstruction(II, NewCode);
+        std::advance(II, NewCode.size() - 1);
+        BC.MIB->addAnnotation(*II, "PLTCall", true);
         ++NumCallsOptimized;
       }
     }
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 0ae9d3668b93b..9590629f9b344 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1055,6 +1055,47 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
+  InstructionListType createIndirectPltCall(const MCInst &DirectCall,
+                                            const MCSymbol *TargetLocation,
+                                            MCContext *Ctx) override {
+    const bool TailCall = isTailCall(DirectCall);
+    assert((DirectCall.getOpcode() == AArch64::BL ||
+            (TailCall && DirectCall.getOpcode() == AArch64::B)) &&
+           "64-bit direct (tail) call instruction expected");
+
+    InstructionListType Sequence;
+    // Indirect replacement for the direct PLT call, using x16 and x17:
+    //   adrp x16, <symbol>             ; R_AARCH64_ADR_GOT_PAGE
+    //   ldr  x17, [x16, #<offset>]     ; R_AARCH64_LD64_GOT_LO12_NC
+    //   blr  x17                       ; 'br x17' for tail calls
+    MCInst PageAddr;
+    PageAddr.setOpcode(AArch64::ADRP);
+    PageAddr.addOperand(MCOperand::createReg(AArch64::X16));
+    PageAddr.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(PageAddr, /* OpNum */ 1, TargetLocation,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE);
+    Sequence.push_back(PageAddr);
+
+    MCInst GotLoad;
+    GotLoad.setOpcode(AArch64::LDRXui);
+    GotLoad.addOperand(MCOperand::createReg(AArch64::X17));
+    GotLoad.addOperand(MCOperand::createReg(AArch64::X16));
+    GotLoad.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(GotLoad, /* OpNum */ 2, TargetLocation,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC);
+    Sequence.push_back(GotLoad);
+
+    MCInst Branch;
+    Branch.setOpcode(TailCall ? AArch64::BR : AArch64::BLR);
+    Branch.addOperand(MCOperand::createReg(AArch64::X17));
+    // NOTE(review): only DirectCall's tail-call annotation is carried over.
+    if (TailCall)
+      setTailCall(Branch);
+    Sequence.push_back(Branch);
+
+    return Sequence;
+  }
+
   bool lowerTailCall(MCInst &Inst) override {
     removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
     if (getConditionalTailCall(Inst))
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index a33a9dc8c013c..792129544b7bb 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1639,11 +1639,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool convertCallToIndirectCall(MCInst &Inst, const MCSymbol *TargetLocation,
-                                 MCContext *Ctx) override {
-    assert((Inst.getOpcode() == X86::CALL64pcrel32 ||
-            (Inst.getOpcode() == X86::JMP_4 && isTailCall(Inst))) &&
+  InstructionListType createIndirectPltCall(const MCInst &DirectCall,
+                                            const MCSymbol *TargetLocation,
+                                            MCContext *Ctx) override {
+    assert((DirectCall.getOpcode() == X86::CALL64pcrel32 ||
+            (DirectCall.getOpcode() == X86::JMP_4 && isTailCall(DirectCall))) &&
            "64-bit direct (tail) call instruction expected");
+
+    InstructionListType Code;
+    // Create a new indirect call by converting the previous direct call.
+    MCInst Inst = DirectCall;
     const auto NewOpcode =
         (Inst.getOpcode() == X86::CALL64pcrel32) ? X86::CALL64m : X86::JMP32m;
     Inst.setOpcode(NewOpcode);
@@ -1664,7 +1669,8 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     Inst.insert(Inst.begin(),
                 MCOperand::createReg(X86::RIP));        // BaseReg
 
-    return true;
+    Code.emplace_back(Inst);
+    return Code;
   }
 
   void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override {
diff --git a/bolt/test/AArch64/plt-call.test b/bolt/test/AArch64/plt-call.test
new file mode 100644
index 0000000000000..da307d4a6c01e
--- /dev/null
+++ b/bolt/test/AArch64/plt-call.test
@@ -0,0 +1,15 @@
+// Verify that the PLTCall optimization (--plt=all) turns PLT calls, including tail calls, into indirect calls through the GOT.
+
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN:    -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt  --print-only=foo | FileCheck %s
+
+// Call to printf
+CHECK: adrp	x16, printf@GOT
+CHECK: ldr	x17, [x16, :lo12:printf@GOT]
+CHECK: blr	x17 # PLTCall: 1
+
+// Tail call to puts (forced with musttail in the input)
+CHECK: adrp	x16, puts@GOT
+CHECK: ldr	x17, [x16, :lo12:puts@GOT]
+CHECK: br	x17 # TAILCALL  # PLTCall: 1
diff --git a/bolt/test/Inputs/plt-tailcall.c b/bolt/test/Inputs/plt-tailcall.c
new file mode 100644
index 0000000000000..13f6e29c60774
--- /dev/null
+++ b/bolt/test/Inputs/plt-tailcall.c
@@ -0,0 +1,8 @@
+#include "stub.h"
+
+int foo(char *c) { // Test input: makes one regular call and one tail call.
+  printf(""); // Regular call; the plt-call tests check it as "Call to printf".
+  __attribute__((musttail)) return puts(c); // Forced tail call to puts.
+}
+
+int main() { return foo("a"); } // Exit status is whatever foo returns.
diff --git a/bolt/test/X86/plt-call.test b/bolt/test/X86/plt-call.test
new file mode 100644
index 0000000000000..e6ae86c179d27
--- /dev/null
+++ b/bolt/test/X86/plt-call.test
@@ -0,0 +1,11 @@
+// Verify that the PLTCall optimization (--plt=all) turns PLT calls, including tail calls, into indirect calls through the GOT.
+
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN:    -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt  --print-only=foo | FileCheck %s
+
+// Call to printf
+CHECK: callq *printf@GOT(%rip) # PLTCall: 1
+
+// Tail call to puts (forced with musttail in the input)
+CHECK: jmpl *puts@GOT(%rip) # TAILCALL # PLTCall: 1