Date:      Mon, 2 Feb 2015 20:36:17 +0000 (UTC)
From:      Dimitry Andric <dim@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r278113 - projects/clang360-import/contrib/llvm/patches
Message-ID:  <201502022036.t12KaHbD075043@svn.freebsd.org>

Author: dim
Date: Mon Feb  2 20:36:16 2015
New Revision: 278113
URL: https://svnweb.freebsd.org/changeset/base/278113

Log:
  Add the llvm patch corresponding to r278112.

Added:
  projects/clang360-import/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff

Added: projects/clang360-import/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/clang360-import/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff	Mon Feb  2 20:36:16 2015	(r278113)
@@ -0,0 +1,1271 @@
+Pull in r227752 from upstream llvm trunk (by Michael Kuperstein):
+
+  [X86] Convert esp-relative movs of function arguments to pushes, step 2
+
+  This moves the transformation introduced in r223757 into a separate MI pass.
+  This allows it to cover many more cases (not only cases where there must be a 
+  reserved call frame), and perform rudimentary call folding. It still doesn't 
+  have a heuristic, so it is enabled only for optsize/minsize, with stack 
+  alignment <= 8, where it ought to be a fairly clear win.
+
+  (Re-commit of r227728)
+
+  Differential Revision: http://reviews.llvm.org/D6789
+
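+As a rough before/after sketch (a hypothetical three-argument cdecl
+call, not taken from this patch), the pass turns a 32-bit call
+sequence like:
+
+    subl    $12, %esp
+    movl    %edx, 8(%esp)
+    movl    %ecx, 4(%esp)
+    movl    %eax, (%esp)
+    calll   foo
+
+into the equivalent push-based form:
+
+    pushl   %edx
+    pushl   %ecx
+    pushl   %eax
+    calll   foo
+
+Each pushl of a register encodes in a single byte, versus three or
+four bytes for an esp-relative movl, which is where the size win
+comes from.
+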
+This helps to get sys/boot/i386/boot2 below the required size again,
+when optimizing with -Oz.
+
+Introduced here: http://svnweb.freebsd.org/changeset/base/278112
+
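+The rudimentary folding mentioned in the upstream message covers
+loads feeding such pushes: when a register is defined by a plain load
+in the same block and the push is its only use, the pair (sketch,
+hypothetical operands):
+
+    movl    8(%edi), %eax
+    movl    %eax, (%esp)
+
+collapses into a single push from memory:
+
+    pushl   8(%edi)
+
+The pass skips this folding on targets where a push from memory is
+slow (Atom and Silvermont, per the code below).
+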
+Index: include/llvm/Target/TargetFrameLowering.h
+===================================================================
+--- include/llvm/Target/TargetFrameLowering.h
++++ include/llvm/Target/TargetFrameLowering.h
+@@ -193,6 +193,11 @@ class TargetFrameLowering {
+     return hasReservedCallFrame(MF) || hasFP(MF);
+   }
+ 
++  // needsFrameIndexResolution - Do we need to perform FI resolution for
++  // this function? Normally, this is required only when the function
++  // has any stack objects. However, targets may want to override this.
++  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
++
+   /// getFrameIndexOffset - Returns the displacement from the frame register to
+   /// the stack frame of the specified index.
+   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+Index: lib/CodeGen/PrologEpilogInserter.cpp
+===================================================================
+--- lib/CodeGen/PrologEpilogInserter.cpp
++++ lib/CodeGen/PrologEpilogInserter.cpp
+@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &
+ /// register references and actual offsets.
+ ///
+ void PEI::replaceFrameIndices(MachineFunction &Fn) {
+-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
++  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
++  if (!TFI.needsFrameIndexResolution(Fn)) return;
+ 
+   // Store SPAdj at exit of a basic block.
+   SmallVector<int, 8> SPState;
+@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
+       continue;
+     }
+ 
+-    // If we are looking at a call sequence, we need to keep track of
+-    // the SP adjustment made by each instruction in the sequence.
+-    // This includes both the frame setup/destroy pseudos (handled above),
+-    // as well as other instructions that have side effects w.r.t the SP.
+-    if (InsideCallSequence)
+-      SPAdj += TII.getSPAdjust(I);
+-
+     MachineInstr *MI = I;
+     bool DoIncr = true;
+     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
+       break;
+     }
+ 
++    // If we are looking at a call sequence, we need to keep track of
++    // the SP adjustment made by each instruction in the sequence.
++    // This includes both the frame setup/destroy pseudos (handled above),
++    // as well as other instructions that have side effects w.r.t the SP.
++    // Note that this must come after eliminateFrameIndex, because 
++    // if I itself referred to a frame index, we shouldn't count its own
++    // adjustment.
++    if (MI && InsideCallSequence)
++      SPAdj += TII.getSPAdjust(MI);
++
+     if (DoIncr && I != BB->end()) ++I;
+ 
+     // Update register states.
+Index: lib/CodeGen/TargetFrameLoweringImpl.cpp
+===================================================================
+--- lib/CodeGen/TargetFrameLoweringImpl.cpp
++++ lib/CodeGen/TargetFrameLoweringImpl.cpp
+@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(co
+   FrameReg = RI->getFrameRegister(MF);
+   return getFrameIndexOffset(MF, FI);
+ }
++
++bool TargetFrameLowering::needsFrameIndexResolution(
++    const MachineFunction &MF) const {
++  return MF.getFrameInfo()->hasStackObjects();
++}
+Index: lib/Target/X86/CMakeLists.txt
+===================================================================
+--- lib/Target/X86/CMakeLists.txt
++++ lib/Target/X86/CMakeLists.txt
+@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen)
+ 
+ set(sources
+   X86AsmPrinter.cpp
++  X86CallFrameOptimization.cpp
+   X86FastISel.cpp
+   X86FloatingPoint.cpp
+   X86FrameLowering.cpp
+Index: lib/Target/X86/X86.h
+===================================================================
+--- lib/Target/X86/X86.h
++++ lib/Target/X86/X86.h
+@@ -67,6 +67,11 @@ FunctionPass *createX86PadShortFunctions();
+ /// to eliminate execution delays in some Atom processors.
+ FunctionPass *createX86FixupLEAs();
+ 
++/// createX86CallFrameOptimization - Return a pass that optimizes
++/// the code-size of x86 call sequences. This is done by replacing
++/// esp-relative movs with pushes.
++FunctionPass *createX86CallFrameOptimization();
++
+ } // End llvm namespace
+ 
+ #endif
+Index: lib/Target/X86/X86CallFrameOptimization.cpp
+===================================================================
+--- lib/Target/X86/X86CallFrameOptimization.cpp
++++ lib/Target/X86/X86CallFrameOptimization.cpp
+@@ -0,0 +1,400 @@
++//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
++//
++//                     The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++//===----------------------------------------------------------------------===//
++//
++// This file defines a pass that optimizes call sequences on x86.
++// Currently, it converts movs of function parameters onto the stack into 
++// pushes. This is beneficial for two main reasons:
++// 1) The push instruction encoding is much smaller than an esp-relative mov
++// 2) It is possible to push memory arguments directly. So, if the
++//    transformation is performed pre-reg-alloc, it can help relieve
++//    register pressure.
++//
++//===----------------------------------------------------------------------===//
++
++#include <algorithm>
++
++#include "X86.h"
++#include "X86InstrInfo.h"
++#include "X86Subtarget.h"
++#include "X86MachineFunctionInfo.h"
++#include "llvm/ADT/Statistic.h"
++#include "llvm/CodeGen/MachineFunctionPass.h"
++#include "llvm/CodeGen/MachineInstrBuilder.h"
++#include "llvm/CodeGen/MachineRegisterInfo.h"
++#include "llvm/CodeGen/Passes.h"
++#include "llvm/IR/Function.h"
++#include "llvm/Support/Debug.h"
++#include "llvm/Support/raw_ostream.h"
++#include "llvm/Target/TargetInstrInfo.h"
++
++using namespace llvm;
++
++#define DEBUG_TYPE "x86-cf-opt"
++
++cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
++              cl::desc("Avoid optimizing x86 call frames for size"),
++              cl::init(false), cl::Hidden);
++
++namespace {
++class X86CallFrameOptimization : public MachineFunctionPass {
++public:
++  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
++
++  bool runOnMachineFunction(MachineFunction &MF) override;
++
++private:
++  bool shouldPerformTransformation(MachineFunction &MF);
++
++  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
++                          MachineBasicBlock::iterator I);
++
++  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
++                                   unsigned Reg);
++
++  const char *getPassName() const override {
++    return "X86 Optimize Call Frame";
++  }
++
++  const TargetInstrInfo *TII;
++  const TargetFrameLowering *TFL;
++  const MachineRegisterInfo *MRI;
++  static char ID;
++};
++
++char X86CallFrameOptimization::ID = 0;
++}
++
++FunctionPass *llvm::createX86CallFrameOptimization() {
++  return new X86CallFrameOptimization();
++}
++
++// This checks whether the transformation is legal and profitable
++bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
++  if (NoX86CFOpt.getValue())
++    return false;
++
++  // We currently only support call sequences where *all* parameters
++  // are passed on the stack.
++  // No point in running this in 64-bit mode, since some arguments are
++  // passed in-register in all common calling conventions, so the pattern
++  // we're looking for will never match.
++  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
++  if (STI.is64Bit())
++    return false;
++
++  // You would expect straight-line code between call-frame setup and
++  // call-frame destroy. You would be wrong. There are circumstances (e.g.
++  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
++  // end up with the setup and the destroy in different basic blocks.
++  // This is bad, and breaks SP adjustment.
++  // So, check that all of the frames in the function are closed inside
++  // the same block, and, for good measure, that there are no nested frames.
++  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
++  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
++  for (MachineBasicBlock &BB : MF) {
++    bool InsideFrameSequence = false;
++    for (MachineInstr &MI : BB) {
++      if (MI.getOpcode() == FrameSetupOpcode) {
++        if (InsideFrameSequence)
++          return false;
++        InsideFrameSequence = true;
++      }
++      else if (MI.getOpcode() == FrameDestroyOpcode) {
++        if (!InsideFrameSequence)
++          return false;
++        InsideFrameSequence = false;
++      }
++    }
++
++    if (InsideFrameSequence)
++      return false;
++  }
++
++  // Now that we know the transformation is legal, check if it is
++  // profitable.
++  // TODO: Add a heuristic that actually looks at the function,
++  //       and enable this for more cases.
++
++  // This transformation is always a win when we expected to have
++  // a reserved call frame. Under other circumstances, it may be either 
++  // a win or a loss, and requires a heuristic.
++  // For now, enable it only for the relatively clear win cases.
++  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
++  if (CannotReserveFrame)
++    return true;
++
++  // For now, don't even try to evaluate the profitability when
++  // not optimizing for size.
++  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
++  bool OptForSize =
++    FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
++    Attribute::OptimizeForSize) ||
++    FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
++
++  if (!OptForSize)
++    return false;
++
++  // Stack re-alignment can make this unprofitable even in terms of size.
++  // As mentioned above, a better heuristic is needed. For now, don't do this
++  // when the required alignment is above 8. (4 would be the safe choice, but
++  // some experimentation showed 8 is generally good).
++  if (TFL->getStackAlignment() > 8)
++    return false;
++
++  return true;
++}
++
++bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
++  TII = MF.getSubtarget().getInstrInfo();
++  TFL = MF.getSubtarget().getFrameLowering();
++  MRI = &MF.getRegInfo();
++
++  if (!shouldPerformTransformation(MF))
++    return false;
++
++  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
++
++  bool Changed = false;
++
++  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
++    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
++      if (I->getOpcode() == FrameSetupOpcode)
++        Changed |= adjustCallSequence(MF, *BB, I);
++
++  return Changed;
++}
++
++bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
++                                                MachineBasicBlock &MBB,
++                                                MachineBasicBlock::iterator I) {
++
++  // Check that this particular call sequence is amenable to the
++  // transformation.
++  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
++                                       MF.getSubtarget().getRegisterInfo());
++  unsigned StackPtr = RegInfo.getStackRegister();
++  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
++
++  // We expect to enter this at the beginning of a call sequence
++  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
++  MachineBasicBlock::iterator FrameSetup = I++;
++
++  
++  // For globals in PIC mode, we can have some LEAs here.
++  // Ignore them, they don't bother us.
++  // TODO: Extend this to something that covers more cases.
++  while (I->getOpcode() == X86::LEA32r)
++    ++I;
++  
++  // We expect a copy instruction here.
++  // TODO: The copy instruction is a lowering artifact.
++  //       We should also support a copy-less version, where the stack
++  //       pointer is used directly.
++  if (!I->isCopy() || !I->getOperand(0).isReg())
++    return false;
++  MachineBasicBlock::iterator SPCopy = I++;
++  StackPtr = SPCopy->getOperand(0).getReg();
++
++  // Scan the call setup sequence for the pattern we're looking for.
++  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
++  // instructions, that push a sequence of 32-bit values onto the stack, with
++  // no gaps between them.
++  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
++  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
++  if (MaxAdjust > 4)
++    MovVector.resize(MaxAdjust, nullptr);
++
++  do {
++    int Opcode = I->getOpcode();
++    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
++      break;
++
++    // We only want movs of the form:
++    // movl imm/r32, k(%esp)
++    // If we run into something else, bail.
++    // Note that AddrBaseReg may, counter to its name, not be a register,
++    // but rather a frame index.
++    // TODO: Support the fi case. This should probably work now that we
++    // have the infrastructure to track the stack pointer within a call
++    // sequence.
++    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
++        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
++        !I->getOperand(X86::AddrScaleAmt).isImm() ||
++        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
++        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
++        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
++        !I->getOperand(X86::AddrDisp).isImm())
++      return false;
++
++    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
++    assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
++
++    // We really don't want to consider the unaligned case.
++    if (StackDisp % 4)
++      return false;
++    StackDisp /= 4;
++
++    assert((size_t)StackDisp < MovVector.size() &&
++      "Function call has more parameters than the stack is adjusted for.");
++
++    // If the same stack slot is being filled twice, something's fishy.
++    if (MovVector[StackDisp] != nullptr)
++      return false;
++    MovVector[StackDisp] = I;
++
++    ++I;
++  } while (I != MBB.end());
++
++  // We now expect the end of the sequence - a call and a stack adjust.
++  if (I == MBB.end())
++    return false;
++
++  // For PCrel calls, we expect an additional COPY of the basereg.
++  // If we find one, skip it.
++  if (I->isCopy()) {
++    if (I->getOperand(1).getReg() ==
++      MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
++      ++I;
++    else
++      return false;
++  }
++
++  if (!I->isCall())
++    return false;
++  MachineBasicBlock::iterator Call = I;
++  if ((++I)->getOpcode() != FrameDestroyOpcode)
++    return false;
++
++  // Now, go through the vector, and see that we don't have any gaps,
++  // but only a series of 32-bit MOVs.
++  
++  int64_t ExpectedDist = 0;
++  auto MMI = MovVector.begin(), MME = MovVector.end();
++  for (; MMI != MME; ++MMI, ExpectedDist += 4)
++    if (*MMI == nullptr)
++      break;
++  
++  // If the call had no parameters, do nothing
++  if (!ExpectedDist)
++    return false;
++
++  // We are either at the last parameter, or a gap. 
++  // Make sure it's not a gap
++  for (; MMI != MME; ++MMI)
++    if (*MMI != nullptr)
++      return false;
++
++  // Ok, we can in fact do the transformation for this call.
++  // Do not remove the FrameSetup instruction, but adjust the parameters.
++  // PEI will end up finalizing the handling of this.
++  FrameSetup->getOperand(1).setImm(ExpectedDist);
++
++  DebugLoc DL = I->getDebugLoc();
++  // Now, iterate through the vector in reverse order, and replace the movs
++  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to 
++  // replace uses.
++  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
++    MachineBasicBlock::iterator MOV = *MovVector[Idx];
++    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
++    if (MOV->getOpcode() == X86::MOV32mi) {
++      unsigned PushOpcode = X86::PUSHi32;
++      // If the operand is a small (8-bit) immediate, we can use a
++      // PUSH instruction with a shorter encoding.
++      // Note that isImm() may fail even though this is a MOVmi, because
++      // the operand can also be a symbol.
++      if (PushOp.isImm()) {
++        int64_t Val = PushOp.getImm();
++        if (isInt<8>(Val))
++          PushOpcode = X86::PUSH32i8;
++      }
++      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
++    } else {
++      unsigned int Reg = PushOp.getReg();
++
++      // If PUSHrmm is not slow on this target, try to fold the source of the
++      // push into the instruction.
++      const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
++      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
++
++      // Check that this is legal to fold. Right now, we're extremely
++      // conservative about that.
++      MachineInstr *DefMov = nullptr;
++      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
++        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
++
++        unsigned NumOps = DefMov->getDesc().getNumOperands();
++        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
++          Push->addOperand(DefMov->getOperand(i));
++
++        DefMov->eraseFromParent();
++      } else {
++        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
++      }
++    }
++
++    MBB.erase(MOV);
++  }
++
++  // The stack-pointer copy is no longer used in the call sequences.
++  // There should not be any other users, but we can't commit to that, so:
++  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
++    SPCopy->eraseFromParent();
++
++  // Once we've done this, we need to make sure PEI doesn't assume a reserved
++  // frame.
++  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
++  FuncInfo->setHasPushSequences(true);
++
++  return true;
++}
++
++MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
++    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
++  // Do an extremely restricted form of load folding.
++  // ISel will often create patterns like:
++  // movl    4(%edi), %eax
++  // movl    8(%edi), %ecx
++  // movl    12(%edi), %edx
++  // movl    %edx, 8(%esp)
++  // movl    %ecx, 4(%esp)
++  // movl    %eax, (%esp)
++  // call
++  // Get rid of those with prejudice.
++  if (!TargetRegisterInfo::isVirtualRegister(Reg))
++    return nullptr;
++
++  // Make sure this is the only use of Reg.
++  if (!MRI->hasOneNonDBGUse(Reg))
++    return nullptr;
++
++  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
++
++  // Make sure the def is a MOV from memory.
++  // If the def is in another block, give up.
++  if (DefMI->getOpcode() != X86::MOV32rm ||
++      DefMI->getParent() != FrameSetup->getParent())
++    return nullptr;
++
++  // Be careful with movs that load from a stack slot, since it may get
++  // resolved incorrectly.
++  // TODO: Again, we already have the infrastructure, so this should work.
++  if (!DefMI->getOperand(1).isReg())
++    return nullptr;
++
++  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
++  // of MOVs. To be less conservative would require duplicating a lot of the
++  // logic from PeepholeOptimizer.
++  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
++  // to be smarter about folding into pushes. 
++  for (auto I = DefMI; I != FrameSetup; ++I)
++    if (I->getOpcode() != X86::MOV32rm)
++      return nullptr;
++
++  return DefMI;
++}
+Index: lib/Target/X86/X86FastISel.cpp
+===================================================================
+--- lib/Target/X86/X86FastISel.cpp
++++ lib/Target/X86/X86FastISel.cpp
+@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &
+   // Issue CALLSEQ_START
+   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+-    .addImm(NumBytes);
++    .addImm(NumBytes).addImm(0);
+ 
+   // Walk the register/memloc assignments, inserting copies/loads.
+   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+Index: lib/Target/X86/X86FrameLowering.cpp
+===================================================================
+--- lib/Target/X86/X86FrameLowering.cpp
++++ lib/Target/X86/X86FrameLowering.cpp
+@@ -38,9 +38,36 @@ using namespace llvm;
+ extern cl::opt<bool> ForceStackAlign;
+ 
+ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+-  return !MF.getFrameInfo()->hasVarSizedObjects();
++  return !MF.getFrameInfo()->hasVarSizedObjects() &&
++         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+ }
+ 
++/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
++/// call frame pseudos can be simplified.  Having a FP, as in the default
++/// implementation, is not sufficient here since we can't always use it.
++/// Use a more nuanced condition.
++bool
++X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
++  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
++                               (MF.getSubtarget().getRegisterInfo());
++  return hasReservedCallFrame(MF) ||
++         (hasFP(MF) && !TRI->needsStackRealignment(MF))
++         || TRI->hasBasePointer(MF);
++}
++
++// needsFrameIndexResolution - Do we need to perform FI resolution for
++// this function? Normally, this is required only when the function
++// has any stack objects. However, FI resolution actually has another job,
++// not apparent from the title - it resolves callframesetup/destroy 
++// that were not simplified earlier.
++// So, this is required for x86 functions that have push sequences even
++// when there are no stack objects.
++bool
++X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
++  return MF.getFrameInfo()->hasStackObjects() ||
++         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
++}
++
+ /// hasFP - Return true if the specified function should have a dedicated frame
+ /// pointer register.  This is true if the function has variable sized allocas
+ /// or if frame pointer elimination is disabled.
+@@ -93,16 +120,6 @@ static unsigned getANDriOpcode(bool IsLP64, int64_
+   return X86::AND32ri;
+ }
+ 
+-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
+-  // We don't support LP64 for now.
+-  assert(!IsLP64);
+-
+-  if (MO.isImm() && isInt<8>(MO.getImm()))
+-    return X86::PUSH32i8;
+-
+-  return X86::PUSHi32;;
+-}
+-
+ static unsigned getLEArOpcode(unsigned IsLP64) {
+   return IsLP64 ? X86::LEA64r : X86::LEA32r;
+ }
+@@ -1848,100 +1865,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi
+ #endif
+ }
+ 
+-bool X86FrameLowering::
+-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
+-                       MachineBasicBlock::iterator I, uint64_t Amount) const {
+-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+-  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+-    MF.getSubtarget().getRegisterInfo());
+-  unsigned StackPtr = RegInfo.getStackRegister();
+-
+-  // Scan the call setup sequence for the pattern we're looking for.
+-  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
+-  // instructions, that push a sequence of 32-bit values onto the stack, with
+-  // no gaps.  
+-  std::map<int64_t, MachineBasicBlock::iterator> MovMap;
+-  do {
+-    int Opcode = I->getOpcode();
+-    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+-      break;
+- 
+-    // We only want movs of the form:
+-    // movl imm/r32, k(%ecx)
+-    // If we run into something else, bail
+-    // Note that AddrBaseReg may, counterintuitively, not be a register...
+-    if (!I->getOperand(X86::AddrBaseReg).isReg() || 
+-        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+-        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+-        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+-        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+-        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+-        !I->getOperand(X86::AddrDisp).isImm())
+-      return false;
+-
+-    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+-    
+-    // We don't want to consider the unaligned case.
+-    if (StackDisp % 4)
+-      return false;
+-
+-    // If the same stack slot is being filled twice, something's fishy.
+-    if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
+-      return false;
+-
+-    ++I;
+-  } while (I != MBB.end());
+-
+-  // We now expect the end of the sequence - a call and a stack adjust.
+-  if (I == MBB.end())
+-    return false;
+-  if (!I->isCall())
+-    return false;
+-  MachineBasicBlock::iterator Call = I;
+-  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
+-    return false;
+-
+-  // Now, go through the map, and see that we don't have any gaps,
+-  // but only a series of 32-bit MOVs.
+-  // Since std::map provides ordered iteration, the original order
+-  // of the MOVs doesn't matter.
+-  int64_t ExpectedDist = 0;
+-  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; 
+-       ++MMI, ExpectedDist += 4)
+-    if (MMI->first != ExpectedDist)
+-      return false;
+-
+-  // Ok, everything looks fine. Do the transformation.
+-  DebugLoc DL = I->getDebugLoc();
+-
+-  // It's possible the original stack adjustment amount was larger than
+-  // that done by the pushes. If so, we still need a SUB.
+-  Amount -= ExpectedDist;
+-  if (Amount) {
+-    MachineInstr* Sub = BuildMI(MBB, Call, DL,
+-                          TII.get(getSUBriOpcode(false, Amount)), StackPtr)
+-                  .addReg(StackPtr).addImm(Amount);
+-    Sub->getOperand(3).setIsDead();
+-  }
+-
+-  // Now, iterate through the map in reverse order, and replace the movs
+-  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
+-  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
+-    MachineBasicBlock::iterator MOV = MMI->second;
+-    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+-
+-    // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
+-    int PushOpcode = X86::PUSH32r;
+-    if (MOV->getOpcode() == X86::MOV32mi)
+-      PushOpcode = getPUSHiOpcode(false, PushOp);
+-
+-    BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
+-    MBB.erase(MOV);
+-  }
+-
+-  return true;
+-}
+-
+ void X86FrameLowering::
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I) const {
+@@ -1956,7 +1879,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
+   bool IsLP64 = STI.isTarget64BitLP64();
+   DebugLoc DL = I->getDebugLoc();
+   uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+-  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
++  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+   I = MBB.erase(I);
+ 
+   if (!reserveCallFrame) {
+@@ -1976,24 +1899,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
+     Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+ 
+     MachineInstr *New = nullptr;
+-    if (Opcode == TII.getCallFrameSetupOpcode()) {
+-      // Try to convert movs to the stack into pushes.
+-      // We currently only look for a pattern that appears in 32-bit
+-      // calling conventions.
+-      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
+-        return;
+ 
+-      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+-                    StackPtr)
+-        .addReg(StackPtr)
+-        .addImm(Amount);
+-    } else {
+-      assert(Opcode == TII.getCallFrameDestroyOpcode());
++    // Factor out the amount that gets handled inside the sequence
++    // (Pushes of argument for frame setup, callee pops for frame destroy)
++    Amount -= InternalAmt;
+ 
+-      // Factor out the amount the callee already popped.
+-      Amount -= CalleeAmt;
++    if (Amount) {
++      if (Opcode == TII.getCallFrameSetupOpcode()) {
++        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
++          .addReg(StackPtr).addImm(Amount);
++      } else {
++        assert(Opcode == TII.getCallFrameDestroyOpcode());
+ 
+-      if (Amount) {
+         unsigned Opc = getADDriOpcode(IsLP64, Amount);
+         New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+           .addReg(StackPtr).addImm(Amount);
+@@ -2011,13 +1928,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
+     return;
+   }
+ 
+-  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
++  if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
+     // If we are performing frame pointer elimination and if the callee pops
+     // something off the stack pointer, add it back.  We do this until we have
+     // more advanced stack pointer tracking ability.
+-    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
++    unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
+     MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+-      .addReg(StackPtr).addImm(CalleeAmt);
++      .addReg(StackPtr).addImm(InternalAmt);
+ 
+     // The EFLAGS implicit def is dead.
+     New->getOperand(3).setIsDead();
+Index: lib/Target/X86/X86FrameLowering.h
+===================================================================
+--- lib/Target/X86/X86FrameLowering.h
++++ lib/Target/X86/X86FrameLowering.h
+@@ -64,6 +64,8 @@ class X86FrameLowering : public TargetFrameLowerin
+ 
+   bool hasFP(const MachineFunction &MF) const override;
+   bool hasReservedCallFrame(const MachineFunction &MF) const override;
++  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
++  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+ 
+   int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+   int getFrameIndexReference(const MachineFunction &MF, int FI,
+Index: lib/Target/X86/X86InstrCompiler.td
+===================================================================
+--- lib/Target/X86/X86InstrCompiler.td
++++ lib/Target/X86/X86InstrCompiler.td
+@@ -43,9 +43,9 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses
+ // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+ // sub / add which can clobber EFLAGS.
+ let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
++def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "#ADJCALLSTACKDOWN",
+-                           [(X86callseq_start timm:$amt)]>,
++                           []>,
+                           Requires<[NotLP64]>;
+ def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "#ADJCALLSTACKUP",
+@@ -52,7 +52,10 @@ def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins
+                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                           Requires<[NotLP64]>;
+ }
++def : Pat<(X86callseq_start timm:$amt1),
++          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+ 
++
+ // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+ // a stack adjustment and the codegen must know that they may modify the stack
+ // pointer before prolog-epilog rewriting occurs.
+@@ -59,9 +62,9 @@ def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins
+ // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+ // sub / add which can clobber EFLAGS.
+ let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
++def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "#ADJCALLSTACKDOWN",
+-                           [(X86callseq_start timm:$amt)]>,
++                           []>,
+                           Requires<[IsLP64]>;
+ def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "#ADJCALLSTACKUP",
+@@ -68,9 +71,10 @@ def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins
+                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                           Requires<[IsLP64]>;
+ }
++def : Pat<(X86callseq_start timm:$amt1),
++          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+ 
+ 
+-
+ // x86-64 va_start lowering magic.
+ let usesCustomInserter = 1, Defs = [EFLAGS] in {
+ def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+Index: lib/Target/X86/X86InstrInfo.cpp
+===================================================================
+--- lib/Target/X86/X86InstrInfo.cpp
++++ lib/Target/X86/X86InstrInfo.cpp
+@@ -1692,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineI
+   return false;
+ }
+ 
++int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
++  const MachineFunction *MF = MI->getParent()->getParent();
++  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
++
++  if (MI->getOpcode() == getCallFrameSetupOpcode() ||
++      MI->getOpcode() == getCallFrameDestroyOpcode()) {
++    unsigned StackAlign = TFI->getStackAlignment();
++    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * 
++                 StackAlign;
++
++    SPAdj -= MI->getOperand(1).getImm();
++
++    if (MI->getOpcode() == getCallFrameSetupOpcode())
++      return SPAdj;
++    else
++      return -SPAdj;
++  }
++  
++  // To know whether a call adjusts the stack, we need information 
++  // that is bound to the following ADJCALLSTACKUP pseudo.
++  // Look for the next ADJCALLSTACKUP that follows the call.
++  if (MI->isCall()) {
++    const MachineBasicBlock* MBB = MI->getParent();
++    auto I = ++MachineBasicBlock::const_iterator(MI);
++    for (auto E = MBB->end(); I != E; ++I) {
++      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
++          I->isCall())
++        break;
++    }
++
++    // If we could not find a frame destroy opcode, then it has already
++    // been simplified, so we don't care.
++    if (I->getOpcode() != getCallFrameDestroyOpcode())
++      return 0;
++
++    return -(I->getOperand(1).getImm());
++  }
++
++  // Currently handle only PUSHes we can reasonably expect to see
++  // in call sequences
++  switch (MI->getOpcode()) {
++  default: 
++    return 0;
++  case X86::PUSH32i8:
++  case X86::PUSH32r:
++  case X86::PUSH32rmm:
++  case X86::PUSH32rmr:
++  case X86::PUSHi32:
++    return 4;
++  }
++}
++
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and follow operands form a reference to the stack frame.
+ bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+Index: lib/Target/X86/X86InstrInfo.h
+===================================================================
+--- lib/Target/X86/X86InstrInfo.h
++++ lib/Target/X86/X86InstrInfo.h
+@@ -175,6 +175,11 @@ class X86InstrInfo final : public X86GenInstrInfo
+   ///
+   const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ 
++  /// getSPAdjust - This returns the stack pointer adjustment made by
++  /// this instruction. For x86, we need to handle more complex call
++  /// sequences involving PUSHes.
++  int getSPAdjust(const MachineInstr *MI) const override;
++
+   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+   /// extension instruction. That is, it's like a copy where it's legal for the
+   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+Index: lib/Target/X86/X86MachineFunctionInfo.h
+===================================================================
+--- lib/Target/X86/X86MachineFunctionInfo.h
++++ lib/Target/X86/X86MachineFunctionInfo.h
+@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunct
+   unsigned ArgumentStackSize;
+   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+   unsigned NumLocalDynamics;
++  /// HasPushSequences - Keeps track of whether this function uses sequences
++  /// of pushes to pass function parameters.
++  bool HasPushSequences;
+ 
+ private:
+   /// ForwardedMustTailRegParms - A list of virtual and physical registers
+@@ -97,7 +100,8 @@ class X86MachineFunctionInfo : public MachineFunct
+                              VarArgsGPOffset(0),
+                              VarArgsFPOffset(0),
+                              ArgumentStackSize(0),
+-                             NumLocalDynamics(0) {}
++                             NumLocalDynamics(0),
++                             HasPushSequences(false) {}
+ 
+   explicit X86MachineFunctionInfo(MachineFunction &MF)
+     : ForceFramePointer(false),
+@@ -113,11 +117,15 @@ class X86MachineFunctionInfo : public MachineFunct
+       VarArgsGPOffset(0),
+       VarArgsFPOffset(0),
+       ArgumentStackSize(0),
+-      NumLocalDynamics(0) {}
++      NumLocalDynamics(0),
++      HasPushSequences(false) {}
+ 
+   bool getForceFramePointer() const { return ForceFramePointer;}
+   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+ 
++  bool getHasPushSequences() const { return HasPushSequences; }
++  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
++
+   bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
+   void setRestoreBasePointer(const MachineFunction *MF);
+   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+Index: lib/Target/X86/X86RegisterInfo.cpp
+===================================================================
+--- lib/Target/X86/X86RegisterInfo.cpp
++++ lib/Target/X86/X86RegisterInfo.cpp
+@@ -468,8 +468,6 @@ void
+ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                      int SPAdj, unsigned FIOperandNum,
+                                      RegScavenger *RS) const {
+-  assert(SPAdj == 0 && "Unexpected");
+-
+   MachineInstr &MI = *II;
+   MachineFunction &MF = *MI.getParent()->getParent();
+   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicB
+   } else
+     FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ 
++  if (BasePtr == StackPtr)
++    FIOffset += SPAdj;
++
+   // The frame index format for stackmaps and patchpoints is different from the
+   // X86 format. It only has a FI and an offset.
+   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+Index: lib/Target/X86/X86TargetMachine.cpp
+===================================================================
+--- lib/Target/X86/X86TargetMachine.cpp
++++ lib/Target/X86/X86TargetMachine.cpp
+@@ -154,6 +154,7 @@ class X86PassConfig : public TargetPassConfig {
+   void addIRPasses() override;
+   bool addInstSelector() override;
+   bool addILPOpts() override;
++  void addPreRegAlloc() override;
+   void addPostRegAlloc() override;
+   void addPreEmitPass() override;
+ };
+@@ -187,6 +188,10 @@ bool X86PassConfig::addILPOpts() {
+   return true;
+ }
+ 
++void X86PassConfig::addPreRegAlloc() {
++  addPass(createX86CallFrameOptimization());
++}
++
+ void X86PassConfig::addPostRegAlloc() {
+   addPass(createX86FloatingPointStackifierPass());
+ }
+Index: test/CodeGen/X86/inalloca-invoke.ll
+===================================================================
+--- test/CodeGen/X86/inalloca-invoke.ll
++++ test/CodeGen/X86/inalloca-invoke.ll
+@@ -31,7 +31,7 @@ blah:
+           to label %invoke.cont unwind label %lpad
+ 
+ ;  Uses end as sret param.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


