From d143023557a117d4cad4b4785ac3e4bb36264e3e Mon Sep 17 00:00:00 2001 From: Andre Masella Date: Tue, 5 Apr 2022 15:22:21 -0400 Subject: [PATCH] Update to LLVM 12-14 Modify llvmlite to support LLVM 11-14 and modify conda recipe to build LLVM14. Also lift over all patches to LLVM versions as required. --- ...-Limit-size-of-non-GlobalValue-name.patch} | 0 ...tch => llvm11-consecutive_registers.patch} | 0 ...-entrypoints-in-add-TLI-mappings.ll.patch} | 0 ...atch => llvm11-intel-D47188-svml-VF.patch} | 0 ...o-static.patch => llvm11-lto-static.patch} | 0 ...ing.patch => llvm11-partial-testing.patch} | 0 ...t-Limit-size-of-non-GlobalValue-name.patch | 49 + .../llvm12-consecutive_registers.patch | 181 ++ conda-recipes/llvm12-lto-static.patch | 12 + conda-recipes/llvm13-lto-static.patch | 12 + .../llvm14-remove-use-of-clonefile.patch | 54 + conda-recipes/llvm14-svml.patch | 2192 +++++++++++++++++ conda-recipes/llvmdev/bld.bat | 35 +- conda-recipes/llvmdev/build.sh | 18 +- conda-recipes/llvmdev/meta.yaml | 31 +- conda-recipes/llvmlite/bld.bat | 5 +- conda-recipes/llvmlite/meta.yaml | 10 +- ffi/Makefile.freebsd | 2 +- ffi/Makefile.osx | 4 +- ffi/build.py | 15 +- ffi/passmanagers.cpp | 9 +- ffi/targets.cpp | 8 + ffi/value.cpp | 13 +- llvmlite/binding/passmanagers.py | 3 +- llvmlite/tests/test_binding.py | 2 +- 25 files changed, 2583 insertions(+), 72 deletions(-) rename conda-recipes/{0001-Revert-Limit-size-of-non-GlobalValue-name.patch => llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch} (100%) rename conda-recipes/{llvm_11_consecutive_registers.patch => llvm11-consecutive_registers.patch} (100%) rename conda-recipes/{expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch => llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch} (100%) rename conda-recipes/{intel-D47188-svml-VF.patch => llvm11-intel-D47188-svml-VF.patch} (100%) rename conda-recipes/{llvm-lto-static.patch => llvm11-lto-static.patch} (100%) rename conda-recipes/{partial-testing.patch => 
llvm11-partial-testing.patch} (100%) create mode 100644 conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch create mode 100644 conda-recipes/llvm12-consecutive_registers.patch create mode 100644 conda-recipes/llvm12-lto-static.patch create mode 100644 conda-recipes/llvm13-lto-static.patch create mode 100644 conda-recipes/llvm14-remove-use-of-clonefile.patch create mode 100644 conda-recipes/llvm14-svml.patch diff --git a/conda-recipes/0001-Revert-Limit-size-of-non-GlobalValue-name.patch b/conda-recipes/llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch similarity index 100% rename from conda-recipes/0001-Revert-Limit-size-of-non-GlobalValue-name.patch rename to conda-recipes/llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch diff --git a/conda-recipes/llvm_11_consecutive_registers.patch b/conda-recipes/llvm11-consecutive_registers.patch similarity index 100% rename from conda-recipes/llvm_11_consecutive_registers.patch rename to conda-recipes/llvm11-consecutive_registers.patch diff --git a/conda-recipes/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch b/conda-recipes/llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch similarity index 100% rename from conda-recipes/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch rename to conda-recipes/llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch diff --git a/conda-recipes/intel-D47188-svml-VF.patch b/conda-recipes/llvm11-intel-D47188-svml-VF.patch similarity index 100% rename from conda-recipes/intel-D47188-svml-VF.patch rename to conda-recipes/llvm11-intel-D47188-svml-VF.patch diff --git a/conda-recipes/llvm-lto-static.patch b/conda-recipes/llvm11-lto-static.patch similarity index 100% rename from conda-recipes/llvm-lto-static.patch rename to conda-recipes/llvm11-lto-static.patch diff --git a/conda-recipes/partial-testing.patch b/conda-recipes/llvm11-partial-testing.patch similarity index 100% rename from conda-recipes/partial-testing.patch 
rename to conda-recipes/llvm11-partial-testing.patch diff --git a/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch b/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch new file mode 100644 index 000000000..9b722d36c --- /dev/null +++ b/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch @@ -0,0 +1,49 @@ +diff -ur a/lib/IR/Value.cpp b/lib/IR/Value.cpp +--- a/lib/IR/Value.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/IR/Value.cpp 2022-03-31 15:39:31.000000000 -0400 +@@ -38,10 +38,6 @@ + + using namespace llvm; + +-static cl::opt<unsigned> NonGlobalValueMaxNameSize( +- "non-global-value-max-name-size", cl::Hidden, cl::init(1024), +- cl::desc("Maximum size for the name of non-global values.")); +- + //===----------------------------------------------------------------------===// + // Value Class + //===----------------------------------------------------------------------===// +@@ -319,11 +315,6 @@ + if (getName() == NameRef) + return; + +- // Cap the size of non-GlobalValue names. +- if (NameRef.size() > NonGlobalValueMaxNameSize && !isa<GlobalValue>(this)) +- NameRef = +- NameRef.substr(0, std::max(1u, (unsigned)NonGlobalValueMaxNameSize)); +- + assert(!getType()->isVoidTy() && "Cannot assign a name to void values!"); + + // Get the symbol table to update for this object.
+diff -ur a/test/Bitcode/value-with-long-name.ll b/test/Bitcode/value-with-long-name.ll +deleted file mode 100644 +--- a/test/Bitcode/value-with-long-name.ll ++++ /dev/null +@@ -1,18 +0,0 @@ +-; Check the size of generated variable when no option is set +-; RUN: opt -S %s -O2 -o - | FileCheck -check-prefix=CHECK-LONG %s +-; CHECK-LONG: %{{[a-z]{4}[a-z]+}} +- +-; Then check we correctly cap the size of newly generated non-global values name +-; Force the size to be small so that the check works on release and debug build +-; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=0 | FileCheck -check-prefix=CHECK-SHORT %s +-; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=1 | FileCheck -check-prefix=CHECK-SHORT %s +-; CHECK-SHORT-NOT: %{{[a-z][a-z]+}} +- +-define i32 @f(i32 %a, i32 %b) { +- %c = add i32 %a, %b +- %d = add i32 %c, %a +- %e = add i32 %d, %b +- ret i32 %e +-} +- +- diff --git a/conda-recipes/llvm12-consecutive_registers.patch b/conda-recipes/llvm12-consecutive_registers.patch new file mode 100644 index 000000000..cc60217bd --- /dev/null +++ b/conda-recipes/llvm12-consecutive_registers.patch @@ -0,0 +1,181 @@ +diff -ur a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h +--- a/include/llvm/CodeGen/TargetLowering.h 2021-04-06 12:38:18.000000000 -0400 ++++ b/include/llvm/CodeGen/TargetLowering.h 2022-03-31 15:52:45.000000000 -0400 +@@ -3975,7 +3975,8 @@ + /// must be passed in a block of consecutive registers.
+ virtual bool + functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, +- bool isVarArg) const { ++ bool isVarArg, ++ const DataLayout &DL) const { + return false; + } + +diff -ur a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp +--- a/lib/CodeGen/SelectionDAG/FastISel.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/CodeGen/SelectionDAG/FastISel.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -1087,7 +1087,7 @@ + if (Arg.IsByVal) + FinalType = cast(Arg.Ty)->getElementType(); + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( +- FinalType, CLI.CallConv, CLI.IsVarArg); ++ FinalType, CLI.CallConv, CLI.IsVarArg, DL); + + ISD::ArgFlagsTy Flags; + if (Arg.IsZExt) +diff -ur a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -1851,7 +1851,7 @@ + + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( + I.getOperand(0)->getType(), F->getCallingConv(), +- /*IsVarArg*/ false); ++ /*IsVarArg*/ false, DL); + + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, +@@ -9229,7 +9229,7 @@ + CLI.IsTailCall = false; + } else { + bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( +- CLI.RetTy, CLI.CallConv, CLI.IsVarArg); ++ CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL); + for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + ISD::ArgFlagsTy Flags; + if (NeedsRegBlock) { +@@ -9289,7 +9289,7 @@ + if (Args[i].IsByVal) + FinalType = cast(Args[i].Ty)->getElementType(); + bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( +- FinalType, CLI.CallConv, CLI.IsVarArg); ++ FinalType, CLI.CallConv, CLI.IsVarArg, DL); + for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + 
++Value) { + EVT VT = ValueVTs[Value]; +@@ -9830,7 +9830,7 @@ + if (Arg.hasAttribute(Attribute::ByVal)) + FinalType = Arg.getParamByValType(); + bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( +- FinalType, F.getCallingConv(), F.isVarArg()); ++ FinalType, F.getCallingConv(), F.isVarArg(), DL); + for (unsigned Value = 0, NumValues = ValueVTs.size(); + Value != NumValues; ++Value) { + EVT VT = ValueVTs[Value]; +diff -ur a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp +--- a/lib/Target/AArch64/AArch64ISelLowering.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/AArch64/AArch64ISelLowering.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -30,6 +30,7 @@ + #include "llvm/ADT/Triple.h" + #include "llvm/ADT/Twine.h" + #include "llvm/Analysis/VectorUtils.h" ++#include "llvm/CodeGen/Analysis.h" + #include "llvm/CodeGen/CallingConvLower.h" + #include "llvm/CodeGen/MachineBasicBlock.h" + #include "llvm/CodeGen/MachineFrameInfo.h" +@@ -16455,15 +16456,17 @@ + } + + bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( +- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { +- if (Ty->isArrayTy()) +- return true; +- +- const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); +- if (TySize.isScalable() && TySize.getKnownMinSize() > 128) +- return true; ++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, ++ const DataLayout &DL) const { ++ if (!Ty->isArrayTy()) { ++ const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); ++ return TySize.isScalable() && TySize.getKnownMinSize() > 128; ++ } + +- return false; ++ // All non aggregate members of the type must have the same type ++ SmallVector ValueVTs; ++ ComputeValueVTs(*this, DL, Ty, ValueVTs); ++ return is_splat(ValueVTs); + } + + bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, +diff -ur a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h +--- 
a/lib/Target/AArch64/AArch64ISelLowering.h 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/AArch64/AArch64ISelLowering.h 2022-03-31 15:52:45.000000000 -0400 +@@ -770,9 +770,10 @@ + MachineMemOperand::Flags getTargetMMOFlags( + const Instruction &I) const override; + +- bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, +- CallingConv::ID CallConv, +- bool isVarArg) const override; ++ bool functionArgumentNeedsConsecutiveRegisters( ++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, ++ const DataLayout &DL) const override; ++ + /// Used for exception handling on Win64. + bool needsFixedCatchObjects() const override; + +diff -ur a/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +--- a/lib/Target/AArch64/GISel/AArch64CallLowering.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/AArch64/GISel/AArch64CallLowering.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -259,7 +259,7 @@ + assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); + + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( +- OrigArg.Ty, CallConv, false); ++ OrigArg.Ty, CallConv, false, DL); + for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { + Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); + SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], +diff -ur a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp +--- a/lib/Target/ARM/ARMCallLowering.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/ARM/ARMCallLowering.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -218,7 +218,7 @@ + + bool NeedsConsecutiveRegisters = + TLI.functionArgumentNeedsConsecutiveRegisters( +- SplitTy, F.getCallingConv(), F.isVarArg()); ++ SplitTy, F.getCallingConv(), F.isVarArg(), DL); + if (NeedsConsecutiveRegisters) { + Flags.setInConsecutiveRegs(); + if (i == e - 1) +diff -ur a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp +--- 
a/lib/Target/ARM/ARMISelLowering.cpp 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/ARM/ARMISelLowering.cpp 2022-03-31 15:52:45.000000000 -0400 +@@ -19269,7 +19269,8 @@ + /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when + /// passing according to AAPCS rules. + bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( +- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { ++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, ++ const DataLayout &DL) const { + if (getEffectiveCallingConv(CallConv, isVarArg) != + CallingConv::ARM_AAPCS_VFP) + return false; +diff -ur a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h +--- a/lib/Target/ARM/ARMISelLowering.h 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/ARM/ARMISelLowering.h 2022-03-31 15:52:45.000000000 -0400 +@@ -578,7 +578,8 @@ + /// Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( +- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; ++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, ++ const DataLayout &DL) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. +diff -ur a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h +--- a/lib/Target/PowerPC/PPCISelLowering.h 2021-04-06 12:38:18.000000000 -0400 ++++ b/lib/Target/PowerPC/PPCISelLowering.h 2022-03-31 15:52:45.000000000 -0400 +@@ -998,7 +998,8 @@ + /// Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. 
+ bool functionArgumentNeedsConsecutiveRegisters( +- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override { ++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, ++ const DataLayout &DL) const override { + // We support any array type as "consecutive" block in the parameter + // save area. The element type defines the alignment requirement and + // whether the argument should go in GPRs, FPRs, or VRs if available. diff --git a/conda-recipes/llvm12-lto-static.patch b/conda-recipes/llvm12-lto-static.patch new file mode 100644 index 000000000..76cc55def --- /dev/null +++ b/conda-recipes/llvm12-lto-static.patch @@ -0,0 +1,12 @@ +diff -ur a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt +--- llvm-12.0.0.src-orig/tools/lto/CMakeLists.txt 2021-04-06 12:38:18.000000000 -0400 ++++ llvm-12.0.0.src/tools/lto/CMakeLists.txt 2022-03-31 15:46:00.000000000 -0400 +@@ -21,7 +21,7 @@ + + set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/lto.exports) + +-add_llvm_library(LTO SHARED INSTALL_WITH_TOOLCHAIN ${SOURCES} DEPENDS ++add_llvm_library(LTO INSTALL_WITH_TOOLCHAIN ${SOURCES} DEPENDS + intrinsics_gen) + + install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/lto.h diff --git a/conda-recipes/llvm13-lto-static.patch b/conda-recipes/llvm13-lto-static.patch new file mode 100644 index 000000000..b8a624250 --- /dev/null +++ b/conda-recipes/llvm13-lto-static.patch @@ -0,0 +1,12 @@ +diff -ur llvm-13.0.0.src-orig/tools/lto/CMakeLists.txt llvm-13.0.0.src/tools/lto/CMakeLists.txt +--- llvm-13.0.0.src-orig/tools/lto/CMakeLists.txt 2021-09-24 12:18:10.000000000 -0400 ++++ llvm-13.0.0.src/tools/lto/CMakeLists.txt 2022-03-31 17:07:07.000000000 -0400 +@@ -25,7 +25,7 @@ + set(LTO_LIBRARY_TYPE MODULE) + set(LTO_LIBRARY_NAME libLTO) + else() +- set(LTO_LIBRARY_TYPE SHARED) ++ set(LTO_LIBRARY_TYPE STATIC) + set(LTO_LIBRARY_NAME LTO) + endif() + diff --git a/conda-recipes/llvm14-remove-use-of-clonefile.patch b/conda-recipes/llvm14-remove-use-of-clonefile.patch new file mode 100644 
index 000000000..6ef9c9d61 --- /dev/null +++ b/conda-recipes/llvm14-remove-use-of-clonefile.patch @@ -0,0 +1,54 @@ +diff -ur a/llvm-14.0.6.src/lib/Support/Unix/Path.inc b/llvm-14.0.6.src/lib/Support/Unix/Path.inc +--- a/llvm-14.0.6.src/lib/Support/Unix/Path.inc 2022-03-14 05:44:55.000000000 -0400 ++++ b/llvm-14.0.6.src/lib/Support/Unix/Path.inc 2022-09-19 11:30:59.000000000 -0400 +@@ -1462,6 +1462,7 @@ + std::error_code copy_file(const Twine &From, const Twine &To) { + std::string FromS = From.str(); + std::string ToS = To.str(); ++ /* + #if __has_builtin(__builtin_available) + if (__builtin_available(macos 10.12, *)) { + // Optimistically try to use clonefile() and handle errors, rather than +@@ -1490,6 +1491,7 @@ + // cheaper. + } + #endif ++ */ + if (!copyfile(FromS.c_str(), ToS.c_str(), /*State=*/NULL, COPYFILE_DATA)) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +diff -ur a/llvm-14.0.6.src/unittests/Support/Path.cpp b/llvm-14.0.6.src/unittests/Support/Path.cpp +--- a/llvm-14.0.6.src/unittests/Support/Path.cpp 2022-03-14 05:44:55.000000000 -0400 ++++ b/llvm-14.0.6.src/unittests/Support/Path.cpp 2022-09-19 11:33:07.000000000 -0400 +@@ -2267,15 +2267,15 @@ + + EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe), NoError); + EXPECT_TRUE(CheckPermissions(fs::set_uid_on_exe)); +- ++#if !defined(__APPLE__) + EXPECT_EQ(fs::setPermissions(TempPath, fs::set_gid_on_exe), NoError); + EXPECT_TRUE(CheckPermissions(fs::set_gid_on_exe)); +- ++#endif + // Modern BSDs require root to set the sticky bit on files. + // AIX and Solaris without root will mask off (i.e., lose) the sticky bit + // on files. 
+ #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && \ +- !defined(_AIX) && !(defined(__sun__) && defined(__svr4__)) ++ !defined(_AIX) && !(defined(__sun__) && defined(__svr4__)) && !defined(__APPLE__) + EXPECT_EQ(fs::setPermissions(TempPath, fs::sticky_bit), NoError); + EXPECT_TRUE(CheckPermissions(fs::sticky_bit)); + +@@ -2297,10 +2297,12 @@ + EXPECT_TRUE(CheckPermissions(fs::all_perms)); + #endif // !FreeBSD && !NetBSD && !OpenBSD && !AIX + ++#if !defined(__APPLE__) + EXPECT_EQ(fs::setPermissions(TempPath, fs::all_perms & ~fs::sticky_bit), + NoError); + EXPECT_TRUE(CheckPermissions(fs::all_perms & ~fs::sticky_bit)); + #endif ++#endif + } + + #ifdef _WIN32 diff --git a/conda-recipes/llvm14-svml.patch b/conda-recipes/llvm14-svml.patch new file mode 100644 index 000000000..cdce26b34 --- /dev/null +++ b/conda-recipes/llvm14-svml.patch @@ -0,0 +1,2192 @@ +From bc2dcd190b7148d04772fa7fcd18b5200b758d4a Mon Sep 17 00:00:00 2001 +From: Ivan Butygin +Date: Sun, 24 Jul 2022 20:31:29 +0200 +Subject: [PATCH] Fixes vectorizer and extends SVML support + +Patch was updated to fix SVML calling convention issues uncovered by llvm 10. +In previous versions of patch SVML calling convention was selected based on +compilation settings. So if you try to call 256bit vector function from avx512 +code function will be called with avx512 cc which is incorrect. To fix this +SVML cc was separated into 3 different cc for 128, 256 and 512bit vector lengths +which are selected based on actual input vector length. + +Original patch merged several fixes: + +1. https://reviews.llvm.org/D47188 patch fixes the problem with improper calls +to SVML library as it has non-standard calling conventions. So accordingly it +has SVML calling conventions definitions and code to set CC to the vectorized +calls. As SVML provides several implementations for the math functions we also +took into consideration fast attribute and select more fast implementation in +such case. 
This work is based on original Matt Masten's work. +Author: Denis Nagorny + +2. https://reviews.llvm.org/D53035 patch implements support to legalize SVML +calls by breaking down the illegal vector call instruction into multiple legal +vector call instructions during code generation. Currently the vectorizer does +not check legality of the generated SVML (or any VECLIB) call instructions, and +this can lead to potential problems even during vector type legalization. This +patch addresses this issue by adding a legality check during code generation and +replaces the illegal SVML call with corresponding legalized instructions. +(RFC: http://lists.llvm.org/pipermail/llvm-dev/2018-June/124357.html) +Author: Karthik Senthil +--- + .../include/llvm/Analysis/TargetLibraryInfo.h | 22 +- + llvm/include/llvm/AsmParser/LLToken.h | 3 + + llvm/include/llvm/IR/CMakeLists.txt | 4 + + llvm/include/llvm/IR/CallingConv.h | 5 + + llvm/include/llvm/IR/SVML.td | 62 +++ + llvm/lib/Analysis/CMakeLists.txt | 1 + + llvm/lib/Analysis/TargetLibraryInfo.cpp | 55 +- + llvm/lib/AsmParser/LLLexer.cpp | 3 + + llvm/lib/AsmParser/LLParser.cpp | 6 + + llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +- + llvm/lib/IR/AsmWriter.cpp | 3 + + llvm/lib/IR/Verifier.cpp | 3 + + llvm/lib/Target/X86/X86CallingConv.td | 70 +++ + llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +- + llvm/lib/Target/X86/X86RegisterInfo.cpp | 46 ++ + llvm/lib/Target/X86/X86Subtarget.h | 3 + + .../Transforms/Utils/InjectTLIMappings.cpp | 2 +- + .../Transforms/Vectorize/LoopVectorize.cpp | 269 +++++++++ + .../Generic/replace-intrinsics-with-veclib.ll | 4 +- + .../LoopVectorize/X86/svml-calls-finite.ll | 24 +- + .../LoopVectorize/X86/svml-calls.ll | 108 ++-- + .../LoopVectorize/X86/svml-legal-calls.ll | 513 ++++++++++++++++++ + .../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++ + llvm/test/Transforms/Util/add-TLI-mappings.ll | 18 +- + llvm/utils/TableGen/CMakeLists.txt | 1 + + llvm/utils/TableGen/SVMLEmitter.cpp | 110 ++++ + 
llvm/utils/TableGen/TableGen.cpp | 8 +- + llvm/utils/TableGen/TableGenBackends.h | 1 + + llvm/utils/vim/syntax/llvm.vim | 1 + + 29 files changed, 1341 insertions(+), 70 deletions(-) + create mode 100644 llvm/include/llvm/IR/SVML.td + create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll + create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll + create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp + +diff --git a/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h b/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h +index 17d1e3f770c14..110ff08189867 100644 +--- a/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h ++++ b/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h +@@ -39,6 +39,12 @@ struct VecDesc { + NotLibFunc + }; + ++enum SVMLAccuracy { ++ SVML_DEFAULT, ++ SVML_HA, ++ SVML_EP ++}; ++ + /// Implementation of the target library information. + /// + /// This class constructs tables that hold the target library information and +@@ -157,7 +163,7 @@ class TargetLibraryInfoImpl { + /// Return true if the function F has a vector equivalent with vectorization + /// factor VF. + bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const { +- return !getVectorizedFunction(F, VF).empty(); ++ return !getVectorizedFunction(F, VF, false).empty(); + } + + /// Return true if the function F has a vector equivalent with any +@@ -166,7 +172,10 @@ class TargetLibraryInfoImpl { + + /// Return the name of the equivalent of F, vectorized with factor VF. If no + /// such mapping exists, return the empty string. 
+- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const; ++ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const; ++ ++ Optional getVectorizedFunctionCallingConv( ++ StringRef F, const FunctionType &FTy, const DataLayout &DL) const; + + /// Set to true iff i32 parameters to library functions should have signext + /// or zeroext attributes if they correspond to C-level int or unsigned int, +@@ -326,8 +335,13 @@ class TargetLibraryInfo { + bool isFunctionVectorizable(StringRef F) const { + return Impl->isFunctionVectorizable(F); + } +- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const { +- return Impl->getVectorizedFunction(F, VF); ++ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const { ++ return Impl->getVectorizedFunction(F, VF, IsFast); ++ } ++ ++ Optional getVectorizedFunctionCallingConv( ++ StringRef F, const FunctionType &FTy, const DataLayout &DL) const { ++ return Impl->getVectorizedFunctionCallingConv(F, FTy, DL); + } + + /// Tests if the function is both available and a candidate for optimized code +diff --git a/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h b/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h +index 78ebb35e0ea4d..3ffb57db8b18b 100644 +--- a/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h ++++ b/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h +@@ -133,6 +133,9 @@ enum Kind { + kw_fastcc, + kw_coldcc, + kw_intel_ocl_bicc, ++ kw_intel_svmlcc128, ++ kw_intel_svmlcc256, ++ kw_intel_svmlcc512, + kw_cfguard_checkcc, + kw_x86_stdcallcc, + kw_x86_fastcallcc, +diff --git a/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt b/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt +index 0498fc269b634..23bb3de41bc1a 100644 +--- a/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt ++++ b/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt +@@ -20,3 +20,7 @@ tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86) + tablegen(LLVM 
IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore) + tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve) + add_public_tablegen_target(intrinsics_gen) ++ ++set(LLVM_TARGET_DEFINITIONS SVML.td) ++tablegen(LLVM SVML.inc -gen-svml) ++add_public_tablegen_target(svml_gen) +diff --git a/llvm-14.0.6.src/include/llvm/IR/CallingConv.h b/llvm-14.0.6.src/include/llvm/IR/CallingConv.h +index fd28542465225..096eea1a8e19b 100644 +--- a/llvm-14.0.6.src/include/llvm/IR/CallingConv.h ++++ b/llvm-14.0.6.src/include/llvm/IR/CallingConv.h +@@ -252,6 +252,11 @@ namespace CallingConv { + /// M68k_INTR - Calling convention used for M68k interrupt routines. + M68k_INTR = 101, + ++ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library ++ Intel_SVML128 = 102, ++ Intel_SVML256 = 103, ++ Intel_SVML512 = 104, ++ + /// The highest possible calling convention ID. Must be some 2^k - 1. + MaxID = 1023 + }; +diff --git a/llvm-14.0.6.src/include/llvm/IR/SVML.td b/llvm-14.0.6.src/include/llvm/IR/SVML.td +new file mode 100644 +index 0000000000000..5af710404c9d9 +--- /dev/null ++++ b/llvm-14.0.6.src/include/llvm/IR/SVML.td +@@ -0,0 +1,62 @@ ++//===-- Intel_SVML.td - Defines SVML call variants ---------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file is used by TableGen to define the different types of SVML function ++// variants used with -fveclib=SVML.
++// ++//===----------------------------------------------------------------------===// ++ ++class SvmlVariant; ++ ++def sin : SvmlVariant; ++def cos : SvmlVariant; ++def pow : SvmlVariant; ++def exp : SvmlVariant; ++def log : SvmlVariant; ++def acos : SvmlVariant; ++def acosh : SvmlVariant; ++def asin : SvmlVariant; ++def asinh : SvmlVariant; ++def atan2 : SvmlVariant; ++def atan : SvmlVariant; ++def atanh : SvmlVariant; ++def cbrt : SvmlVariant; ++def cdfnorm : SvmlVariant; ++def cdfnorminv : SvmlVariant; ++def cosd : SvmlVariant; ++def cosh : SvmlVariant; ++def erf : SvmlVariant; ++def erfc : SvmlVariant; ++def erfcinv : SvmlVariant; ++def erfinv : SvmlVariant; ++def exp10 : SvmlVariant; ++def exp2 : SvmlVariant; ++def expm1 : SvmlVariant; ++def hypot : SvmlVariant; ++def invsqrt : SvmlVariant; ++def log10 : SvmlVariant; ++def log1p : SvmlVariant; ++def log2 : SvmlVariant; ++def sind : SvmlVariant; ++def sinh : SvmlVariant; ++def sqrt : SvmlVariant; ++def tan : SvmlVariant; ++def tanh : SvmlVariant; ++ ++// TODO: SVML does not currently provide _ha and _ep variants of these functions. ++// We should call the default variant of these functions in all cases instead.
++ ++// def nearbyint : SvmlVariant; ++// def logb : SvmlVariant; ++// def floor : SvmlVariant; ++// def fmod : SvmlVariant; ++// def ceil : SvmlVariant; ++// def trunc : SvmlVariant; ++// def rint : SvmlVariant; ++// def round : SvmlVariant; +diff --git a/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt b/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt +index aec84124129f4..98286e166fbe2 100644 +--- a/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt ++++ b/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt +@@ -150,6 +150,7 @@ add_llvm_component_library(LLVMAnalysis + DEPENDS + intrinsics_gen + ${MLDeps} ++ svml_gen + + LINK_LIBS + ${MLLinkDeps} +diff --git a/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp b/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp +index 02923c2c7eb14..83abde28a62a4 100644 +--- a/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp ++++ b/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp +@@ -110,6 +110,11 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) { + F->getFunctionType()); + } + ++static std::string svmlMangle(StringRef FnName, const bool IsFast) { ++ std::string FullName = FnName.str(); ++ return IsFast ? FullName : FullName + "_ha"; ++} ++ + /// Initialize the set of available library functions based on the specified + /// target triple. This should be carefully written so that a missing target + /// triple gets a sane set of defaults. 
+@@ -1876,8 +1881,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( + } + case SVML: { + const VecDesc VecFuncs[] = { +- #define TLI_DEFINE_SVML_VECFUNCS +- #include "llvm/Analysis/VecFuncs.def" ++ #define GET_SVML_VARIANTS ++ #include "llvm/IR/SVML.inc" ++ #undef GET_SVML_VARIANTS + }; + addVectorizableFunctions(VecFuncs); + break; +@@ -1897,20 +1903,51 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { + return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName; + } + +-StringRef +-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, +- const ElementCount &VF) const { ++std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, ++ const ElementCount &VF, ++ bool IsFast) const { ++ bool FromSVML = ClVectorLibrary == SVML; + F = sanitizeFunctionName(F); + if (F.empty()) +- return F; ++ return F.str(); + std::vector::const_iterator I = + llvm::lower_bound(VectorDescs, F, compareWithScalarFnName); + while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) { +- if (I->VectorizationFactor == VF) +- return I->VectorFnName; ++ if (I->VectorizationFactor == VF) { ++ if (FromSVML) { ++ return svmlMangle(I->VectorFnName, IsFast); ++ } ++ return I->VectorFnName.str(); ++ } + ++I; + } +- return StringRef(); ++ return std::string(); ++} ++ ++static CallingConv::ID getSVMLCallingConv(const DataLayout &DL, const FunctionType &FType) ++{ ++ assert(isa(FType.getReturnType())); ++ auto *VecCallRetType = cast(FType.getReturnType()); ++ auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType); ++ if (TypeBitWidth == 128) { ++ return CallingConv::Intel_SVML128; ++ } else if (TypeBitWidth == 256) { ++ return CallingConv::Intel_SVML256; ++ } else if (TypeBitWidth == 512) { ++ return CallingConv::Intel_SVML512; ++ } else { ++ llvm_unreachable("Invalid vector width"); ++ } ++ return 0; // not reachable ++} ++ ++Optional ++TargetLibraryInfoImpl::getVectorizedFunctionCallingConv( ++ StringRef F, const 
FunctionType &FTy, const DataLayout &DL) const { ++ if (F.startswith("__svml")) { ++ return getSVMLCallingConv(DL, FTy); ++ } ++ return {}; + } + + TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F, +diff --git a/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp b/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp +index e3bf41c9721b6..4f9dccd4e0724 100644 +--- a/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp ++++ b/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp +@@ -603,6 +603,9 @@ lltok::Kind LLLexer::LexIdentifier() { + KEYWORD(spir_kernel); + KEYWORD(spir_func); + KEYWORD(intel_ocl_bicc); ++ KEYWORD(intel_svmlcc128); ++ KEYWORD(intel_svmlcc256); ++ KEYWORD(intel_svmlcc512); + KEYWORD(x86_64_sysvcc); + KEYWORD(win64cc); + KEYWORD(x86_regcallcc); +diff --git a/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp b/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp +index 432ec151cf8ae..3bd6ee61024b8 100644 +--- a/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp ++++ b/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp +@@ -1781,6 +1781,9 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) { + /// ::= 'ccc' + /// ::= 'fastcc' + /// ::= 'intel_ocl_bicc' ++/// ::= 'intel_svmlcc128' ++/// ::= 'intel_svmlcc256' ++/// ::= 'intel_svmlcc512' + /// ::= 'coldcc' + /// ::= 'cfguard_checkcc' + /// ::= 'x86_stdcallcc' +@@ -1850,6 +1853,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { + case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; + case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; + case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; ++ case lltok::kw_intel_svmlcc128:CC = CallingConv::Intel_SVML128; break; ++ case lltok::kw_intel_svmlcc256:CC = CallingConv::Intel_SVML256; break; ++ case lltok::kw_intel_svmlcc512:CC = CallingConv::Intel_SVML512; break; + case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; + case lltok::kw_win64cc: CC = CallingConv::Win64; break; + case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; +diff 
--git a/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp +index 0ff045fa787e8..175651949ef85 100644 +--- a/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp ++++ b/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp +@@ -157,7 +157,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, + // and the exact vector width of the call operands in the + // TargetLibraryInfo. + const std::string TLIName = +- std::string(TLI.getVectorizedFunction(ScalarName, VF)); ++ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast())); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" + << ScalarName << "` and vector width " << VF << ".\n"); +diff --git a/llvm-14.0.6.src/lib/IR/AsmWriter.cpp b/llvm-14.0.6.src/lib/IR/AsmWriter.cpp +index 179754e275b03..c4e95752c97e8 100644 +--- a/llvm-14.0.6.src/lib/IR/AsmWriter.cpp ++++ b/llvm-14.0.6.src/lib/IR/AsmWriter.cpp +@@ -306,6 +306,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { + case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break; + case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break; + case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; ++ case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break; ++ case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break; ++ case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break; + case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; + case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; + case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; +diff --git a/llvm-14.0.6.src/lib/IR/Verifier.cpp b/llvm-14.0.6.src/lib/IR/Verifier.cpp +index 989d01e2e3950..bae7382a36e13 100644 +--- a/llvm-14.0.6.src/lib/IR/Verifier.cpp ++++ b/llvm-14.0.6.src/lib/IR/Verifier.cpp +@@ -2457,6 +2457,9 @@ void Verifier::visitFunction(const Function &F) { + case CallingConv::Fast: + case CallingConv::Cold: + case 
CallingConv::Intel_OCL_BI: ++ case CallingConv::Intel_SVML128: ++ case CallingConv::Intel_SVML256: ++ case CallingConv::Intel_SVML512: + case CallingConv::PTX_Kernel: + case CallingConv::PTX_Device: + Assert(!F.isVarArg(), "Calling convention does not support varargs or " +diff --git a/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td b/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td +index 4dd8a6cdd8982..12e65521215e4 100644 +--- a/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td ++++ b/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td +@@ -498,6 +498,21 @@ def RetCC_X86_64 : CallingConv<[ + CCDelegateTo + ]>; + ++// Intel_SVML return-value convention. ++def RetCC_Intel_SVML : CallingConv<[ ++ // Vector types are returned in XMM0,XMM1 ++ CCIfType<[v4f32, v2f64], ++ CCAssignToReg<[XMM0,XMM1]>>, ++ ++ // 256-bit FP vectors ++ CCIfType<[v8f32, v4f64], ++ CCAssignToReg<[YMM0,YMM1]>>, ++ ++ // 512-bit FP vectors ++ CCIfType<[v16f32, v8f64], ++ CCAssignToReg<[ZMM0,ZMM1]>> ++]>; ++ + // This is the return-value convention used for the entire X86 backend. + let Entry = 1 in + def RetCC_X86 : CallingConv<[ +@@ -505,6 +520,10 @@ def RetCC_X86 : CallingConv<[ + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, + ++ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo>, ++ + CCIfSubtarget<"is64Bit()", CCDelegateTo>, + CCDelegateTo + ]>; +@@ -1064,6 +1083,30 @@ def CC_Intel_OCL_BI : CallingConv<[ + CCDelegateTo + ]>; + ++// X86-64 Intel Short Vector Math Library calling convention. ++def CC_Intel_SVML : CallingConv<[ ++ ++ // The SSE vector arguments are passed in XMM registers. ++ CCIfType<[v4f32, v2f64], ++ CCAssignToReg<[XMM0, XMM1, XMM2]>>, ++ ++ // The 256-bit vector arguments are passed in YMM registers. 
++ CCIfType<[v8f32, v4f64], ++ CCAssignToReg<[YMM0, YMM1, YMM2]>>, ++ ++ // The 512-bit vector arguments are passed in ZMM registers. ++ CCIfType<[v16f32, v8f64], ++ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>> ++]>; ++ ++def CC_X86_32_Intr : CallingConv<[ ++ CCAssignToStack<4, 4> ++]>; ++ ++def CC_X86_64_Intr : CallingConv<[ ++ CCAssignToStack<8, 8> ++]>; ++ + //===----------------------------------------------------------------------===// + // X86 Root Argument Calling Conventions + //===----------------------------------------------------------------------===// +@@ -1115,6 +1158,9 @@ def CC_X86_64 : CallingConv<[ + let Entry = 1 in + def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo>, ++ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo>, + CCIfSubtarget<"is64Bit()", CCDelegateTo>, + CCDelegateTo + ]>; +@@ -1227,3 +1273,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, + (sequence "R%u", 12, 15))>; + def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; ++ ++// SVML calling convention ++def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>; ++def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML, ++ K4, K5, K6, K7)>; ++ ++def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>; ++ ++def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "XMM%u", 8, 15))>; ++def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "XMM%u", 6, 15))>; ++ ++def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "YMM%u", 8, 15))>; ++def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "YMM%u", 6, 15))>; ++ ++def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence 
"ZMM%u", 16, 31), ++ K4, K5, K6, K7)>; ++def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, ++ (sequence "ZMM%u", 6, 21), ++ K4, K5, K6, K7)>; +diff --git a/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp b/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp +index 8bb7e81e19bbd..1780ce3fc6467 100644 +--- a/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp ++++ b/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp +@@ -3788,7 +3788,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget.useAVX512Regs() && + (is64Bit() || (CallConv == CallingConv::X86_VectorCall || +- CallConv == CallingConv::Intel_OCL_BI))) ++ CallConv == CallingConv::Intel_OCL_BI || ++ CallConv == CallingConv::Intel_SVML512))) + VecVT = MVT::v16f32; + else if (Subtarget.hasAVX()) + VecVT = MVT::v8f32; +diff --git a/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp b/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp +index 130cb61cdde24..9eec3b25ca9f2 100644 +--- a/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp ++++ b/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp +@@ -272,6 +272,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + } + } + ++namespace { ++std::pair getSVMLRegMaskAndSaveList( ++ bool Is64Bit, bool IsWin64, CallingConv::ID CC) { ++ assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512); ++ unsigned Abi = CC - CallingConv::Intel_SVML128 ; // 0 - 128, 1 - 256, 2 - 512 ++ ++ const std::pair Abi64[] = { ++ std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList), ++ std::make_pair(CSR_64_Intel_SVML_AVX_RegMask, CSR_64_Intel_SVML_AVX_SaveList), ++ std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList), ++ }; ++ ++ const std::pair AbiWin64[] = { ++ std::make_pair(CSR_Win64_Intel_SVML_RegMask, CSR_Win64_Intel_SVML_SaveList), ++ 
std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask, CSR_Win64_Intel_SVML_AVX_SaveList), ++ std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList), ++ }; ++ ++ const std::pair Abi32[] = { ++ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList), ++ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList), ++ std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList), ++ }; ++ ++ if (Is64Bit) { ++ if (IsWin64) { ++ return AbiWin64[Abi]; ++ } else { ++ return Abi64[Abi]; ++ } ++ } else { ++ return Abi32[Abi]; ++ } ++} ++} ++ + const MCPhysReg * + X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "MachineFunction required"); +@@ -327,6 +363,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return CSR_64_Intel_OCL_BI_SaveList; + break; + } ++ case CallingConv::Intel_SVML128: ++ case CallingConv::Intel_SVML256: ++ case CallingConv::Intel_SVML512: { ++ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second; ++ } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; + case CallingConv::X86_RegCall: +@@ -449,6 +490,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + return CSR_64_Intel_OCL_BI_RegMask; + break; + } ++ case CallingConv::Intel_SVML128: ++ case CallingConv::Intel_SVML256: ++ case CallingConv::Intel_SVML512: { ++ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first; ++ } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; + case CallingConv::X86_RegCall: +diff --git a/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h b/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h +index 5d773f0c57dfb..6bdf5bc6f3fe9 100644 +--- a/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h ++++ b/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h +@@ -916,6 +916,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case 
CallingConv::Intel_OCL_BI: ++ case CallingConv::Intel_SVML128: ++ case CallingConv::Intel_SVML256: ++ case CallingConv::Intel_SVML512: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::Win64: +diff --git a/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp +index 047bf5569ded3..59897785f156c 100644 +--- a/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp ++++ b/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp +@@ -92,7 +92,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { + + auto AddVariantDecl = [&](const ElementCount &VF) { + const std::string TLIName = +- std::string(TLI.getVectorizedFunction(ScalarName, VF)); ++ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast())); + if (!TLIName.empty()) { + std::string MangledName = + VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF); +diff --git a/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp +index 46ff0994e04e7..f472af5e1a835 100644 +--- a/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp ++++ b/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp +@@ -712,6 +712,27 @@ class InnerLoopVectorizer { + virtual void printDebugTracesAtStart(){}; + virtual void printDebugTracesAtEnd(){}; + ++ /// Check legality of given SVML call instruction \p VecCall generated for ++ /// scalar call \p Call. If illegal then the appropriate legal instruction ++ /// is returned. ++ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call); ++ ++ /// Returns the legal VF for a call instruction \p CI using TTI information ++ /// and vector type. 
++ ElementCount getLegalVFForCall(CallInst *CI); ++ ++ /// Partially vectorize a given call \p Call by breaking it down into multiple ++ /// calls of \p LegalCall, decided by the variant VF \p LegalVF. ++ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall, ++ unsigned LegalVF); ++ ++ /// Generate shufflevector instruction for a vector value \p V based on the ++ /// current \p Part and a smaller VF \p LegalVF. ++ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part); ++ ++ /// Combine partially vectorized calls stored in \p CallResults. ++ Value *combinePartialVecCalls(SmallVectorImpl &CallResults); ++ + /// The original loop. + Loop *OrigLoop; + +@@ -4596,6 +4617,17 @@ static bool mayDivideByZero(Instruction &I) { + return !CInt || CInt->isZero(); + } + ++static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL, ++ const TargetLibraryInfo &TLI) { ++ Function *VectorF = CI.getCalledFunction(); ++ FunctionType *FTy = VectorF->getFunctionType(); ++ StringRef VFName = VectorF->getName(); ++ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL); ++ if (CC) { ++ CI.setCallingConv(*CC); ++ } ++} ++ + void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + VPUser &ArgOperands, + VPTransformState &State) { +@@ -4664,9 +4696,246 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, + if (isa(V)) + V->copyFastMathFlags(CI); + ++ const DataLayout &DL = V->getModule()->getDataLayout(); ++ setVectorFunctionCallingConv(*V, DL, *TLI); ++ ++ // Perform legalization of SVML call instruction only if original call ++ // was not Intrinsic ++ if (!UseVectorIntrinsic && ++ (V->getCalledFunction()->getName()).startswith("__svml")) { ++ // assert((V->getCalledFunction()->getName()).startswith("__svml")); ++ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump()); ++ auto *LegalV = cast(legalizeSVMLCall(V, CI)); ++ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: "; 
++ LegalV->dump()); ++ State.set(Def, LegalV, Part); ++ addMetadata(LegalV, &I); ++ } else { + State.set(Def, V, Part); + addMetadata(V, &I); ++ } ++ } ++} ++ ++//===----------------------------------------------------------------------===// ++// Implementation of functions for SVML vector call legalization. ++//===----------------------------------------------------------------------===// ++// ++// Unlike other VECLIBs, SVML needs to be used with target-legal ++// vector types. Otherwise, link failures and/or runtime failures ++// will occur. A motivating example could be - ++// ++// double *a; ++// float *b; ++// #pragma clang loop vectorize_width(8) ++// for(i = 0; i < N; ++i) { ++// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX ++// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM ++// } ++// ++// Current implementation of vector code generation in LV is ++// driven based on a single VF (in InnerLoopVectorizer::VF). This ++// inhibits the flexibility of adjusting/choosing different VF ++// for different instructions. ++// ++// Due to this limitation it is much more straightforward to ++// first generate the illegal sin8 (svml_sin8 for SVML vector ++// library) call and then legalize it than trying to avoid ++// generating illegal code from the beginning. ++// ++// A solution for this problem is to check legality of the ++// call instruction right after generating it in vectorizer and ++// if it is illegal we split the call arguments and issue multiple ++// calls to match the legal VF. This is demonstrated currently for ++// the SVML vector library calls (non-intrinsic version only). ++// ++// Future directions and extensions: ++// 1) This legalization example shows us that a good direction ++// for the VPlan framework would be to model the vector call ++// instructions in a way that legal VF for each call is chosen ++// correctly within vectorizer and illegal code generation is ++// avoided. 
++// 2) This logic can also be extended to general vector functions ++// i.e. legalization of OpenMP declare simd functions. The ++// requirements needed for this will be documented soon. ++ ++Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall, ++ CallInst *Call) { ++ ElementCount LegalVF = getLegalVFForCall(VecCall); ++ ++ assert(LegalVF.getKnownMinValue() > 1 && ++ "Legal VF for SVML call must be greater than 1 to vectorize"); ++ ++ if (LegalVF == VF) ++ return VecCall; ++ else if (LegalVF.getKnownMinValue() > VF.getKnownMinValue()) ++ // TODO: handle case when we are underfilling vectors ++ return VecCall; ++ ++ // Legal VF for this SVML call is smaller than chosen VF, break it down into ++ // smaller call instructions ++ ++ // Convert args, types and return type to match legal VF ++ SmallVector NewTys; ++ SmallVector NewArgs; ++ ++ for (Value *ArgOperand : Call->args()) { ++ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF); ++ NewTys.push_back(Ty); ++ NewArgs.push_back(UndefValue::get(Ty)); + } ++ ++ // Construct legal vector function ++ const VFShape Shape = ++ VFShape::get(*Call, LegalVF /*EC*/, false /*HasGlobalPred*/); ++ Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape); ++ assert(LegalVectorF != nullptr && "Can't create legal vector function."); ++ ++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump()); ++ ++ SmallVector OpBundles; ++ Call->getOperandBundlesAsDefs(OpBundles); ++ auto LegalV = std::unique_ptr(CallInst::Create(LegalVectorF, NewArgs, OpBundles)); ++ ++ if (isa(LegalV)) ++ LegalV->copyFastMathFlags(Call); ++ ++ const DataLayout &DL = VecCall->getModule()->getDataLayout(); ++ // Set SVML calling conventions ++ setVectorFunctionCallingConv(*LegalV, DL, *TLI); ++ ++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump()); ++ ++ Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV.get(), LegalVF.getKnownMinValue()); ++ ++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: ";
LegalizedCall->dump()); ++ ++ // Remove the illegal call from Builder ++ VecCall->eraseFromParent(); ++ ++ return LegalizedCall; ++} ++ ++ElementCount InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) { ++ const DataLayout DL = CI->getModule()->getDataLayout(); ++ FunctionType *CallFT = CI->getFunctionType(); ++ // All functions that need legalization should have a vector return type. ++ // This is true for all SVML functions that are currently supported. ++ assert(isa(CallFT->getReturnType()) && ++ "Return type of call that needs legalization is not a vector."); ++ auto *VecCallRetType = cast(CallFT->getReturnType()); ++ Type *ElemType = VecCallRetType->getElementType(); ++ ++ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType); ++ unsigned VectorBitWidth = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); ++ unsigned LegalVF = VectorBitWidth / TypeBitWidth; ++ ++ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n"); ++ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n"); ++ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth ++ << "\n"); ++ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n"); ++ ++ return ElementCount::getFixed(LegalVF); ++} ++ ++// Partial vectorization of a call instruction is achieved by making clones of ++// \p LegalCall and overwriting its argument operands with shufflevector ++// equivalent decided based on \p LegalVF and current Part being filled. 
++Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call, ++ CallInst *LegalCall, ++ unsigned LegalVF) { ++ unsigned NumParts = VF.getKnownMinValue() / LegalVF; ++ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n"); ++ SmallVector CallResults; ++ ++ for (unsigned Part = 0; Part < NumParts; ++Part) { ++ auto *ClonedCall = cast(LegalCall->clone()); ++ ++ // Update the arg operand of cloned call to shufflevector ++ for (unsigned i = 0, ie = Call->arg_size(); i != ie; ++i) { ++ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part); ++ ClonedCall->setArgOperand(i, NewOp); ++ } ++ ++ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump()); ++ ++ auto *PartialVecCall = Builder.Insert(ClonedCall); ++ CallResults.push_back(PartialVecCall); ++ } ++ ++ return combinePartialVecCalls(CallResults); ++} ++ ++Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF, ++ unsigned Part) { ++ // Example: ++ // Consider the following vector code - ++ // %1 = sitofp <4 x i32> %0 to <4 x double> ++ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1) ++ // ++ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking ++ // generateShuffleValue on the operand %1 ++ // If Part = 0, output value is - ++ // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 0, i32 1> ++ // and if Part = 1, output is - ++ // %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 2, i32 3> ++ ++ assert(isa(V->getType()) && ++ "Cannot generate shuffles for non-vector values."); ++ SmallVector ShuffleMask; ++ Value *Undef = UndefValue::get(V->getType()); ++ ++ unsigned ElemIdx = Part * LegalVF; ++ ++ for (unsigned K = 0; K < LegalVF; K++) ++ ShuffleMask.push_back(static_cast(ElemIdx + K)); ++ ++ auto *ShuffleInst = ++ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle"); ++ ++ return ShuffleInst; ++} ++ ++// Results of the calls executed by smaller legal call instructions must be ++// combined to
match the original VF for later use. This is done by constructing ++// shufflevector instructions in a cumulative fashion. ++Value *InnerLoopVectorizer::combinePartialVecCalls( ++ SmallVectorImpl &CallResults) { ++ assert(isa(CallResults[0]->getType()) && ++ "Cannot combine calls with non-vector results."); ++ auto *CallType = cast(CallResults[0]->getType()); ++ ++ Value *CombinedShuffle; ++ unsigned NumElems = CallType->getElementCount().getKnownMinValue() * 2; ++ unsigned NumRegs = CallResults.size(); ++ ++ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) && ++ "Number of partial vector calls to combine must be a power of 2 " ++ "(atleast 2^1)"); ++ ++ while (NumRegs > 1) { ++ for (unsigned I = 0; I < NumRegs; I += 2) { ++ SmallVector ShuffleMask; ++ for (unsigned J = 0; J < NumElems; J++) ++ ShuffleMask.push_back(static_cast(J)); ++ ++ CombinedShuffle = Builder.CreateShuffleVector( ++ CallResults[I], CallResults[I + 1], ShuffleMask, "combined"); ++ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:"; ++ CombinedShuffle->dump()); ++ CallResults.push_back(CombinedShuffle); ++ } ++ ++ SmallVector::iterator Start = CallResults.begin(); ++ SmallVector::iterator End = Start + NumRegs; ++ CallResults.erase(Start, End); ++ ++ NumElems *= 2; ++ NumRegs /= 2; ++ } ++ ++ return CombinedShuffle; + } + + void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +diff --git a/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll +index df8b7c498bd00..63a36549f18fd 100644 +--- a/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll ++++ b/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll +@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu" + define <4 x double> @exp_v4(<4 x double> %in) { + ; SVML-LABEL: define {{[^@]+}}@exp_v4 + ; SVML-SAME: (<4 x double> [[IN:%.*]]) { +-; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x 
double> [[IN]]) ++; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4_ha(<4 x double> [[IN]]) + ; SVML-NEXT: ret <4 x double> [[TMP1]] + ; + ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4 +@@ -37,7 +37,7 @@ declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0 + define <4 x float> @exp_f32(<4 x float> %in) { + ; SVML-LABEL: define {{[^@]+}}@exp_f32 + ; SVML-SAME: (<4 x float> [[IN:%.*]]) { +-; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]]) ++; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4_ha(<4 x float> [[IN]]) + ; SVML-NEXT: ret <4 x float> [[TMP1]] + ; + ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32 +diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll +index a6e191c3d6923..d6e2e11106949 100644 +--- a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll ++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll +@@ -39,7 +39,8 @@ for.end: ; preds = %for.body + declare double @__exp_finite(double) #0 + + ; CHECK-LABEL: @exp_f64 +-; CHECK: <4 x double> @__svml_exp4 ++; CHECK: <2 x double> @__svml_exp2 ++; CHECK: <2 x double> @__svml_exp2 + ; CHECK: ret + define void @exp_f64(double* nocapture %varray) { + entry: +@@ -99,7 +100,8 @@ for.end: ; preds = %for.body + declare double @__log_finite(double) #0 + + ; CHECK-LABEL: @log_f64 +-; CHECK: <4 x double> @__svml_log4 ++; CHECK: <2 x double> @__svml_log2 ++; CHECK: <2 x double> @__svml_log2 + ; CHECK: ret + define void @log_f64(double* nocapture %varray) { + entry: +@@ -159,7 +161,8 @@ for.end: ; preds = %for.body + declare double @__pow_finite(double, double) #0 + + ; CHECK-LABEL: @pow_f64 +-; CHECK: <4 x double> @__svml_pow4 ++; CHECK: <2 x double> @__svml_pow2 ++; CHECK: <2 x double> @__svml_pow2 + ; CHECK: ret + define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { + entry: +@@ -190,7 +193,8 @@ declare 
float @__exp2f_finite(float) #0 + + define void @exp2f_finite(float* nocapture %varray) { + ; CHECK-LABEL: @exp2f_finite( +-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}}) ++; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}}) ++; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}}) + ; CHECK: ret void + ; + entry: +@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0 + + define void @exp2_finite(double* nocapture %varray) { + ; CHECK-LABEL: @exp2_finite( +-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}}) ++; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}}) ++; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}}) + ; CHECK: ret void + ; + entry: +@@ -276,7 +281,8 @@ for.end: ; preds = %for.body + declare double @__log2_finite(double) #0 + + ; CHECK-LABEL: @log2_f64 +-; CHECK: <4 x double> @__svml_log24 ++; CHECK: <2 x double> @__svml_log22 ++; CHECK: <2 x double> @__svml_log22 + ; CHECK: ret + define void @log2_f64(double* nocapture %varray) { + entry: +@@ -333,7 +339,8 @@ for.end: ; preds = %for.body + declare double @__log10_finite(double) #0 + + ; CHECK-LABEL: @log10_f64 +-; CHECK: <4 x double> @__svml_log104 ++; CHECK: <2 x double> @__svml_log102 ++; CHECK: <2 x double> @__svml_log102 + ; CHECK: ret + define void @log10_f64(double* nocapture %varray) { + entry: +@@ -390,7 +397,8 @@ for.end: ; preds = %for.body + declare double @__sqrt_finite(double) #0 + + ; CHECK-LABEL: @sqrt_f64 +-; CHECK: <4 x double> @__svml_sqrt4 ++; CHECK: <2 x double> @__svml_sqrt2 ++; CHECK: <2 x double> @__svml_sqrt2 + ; CHECK: ret + define void @sqrt_f64(double* nocapture %varray) { + entry: +diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll +index 42c280df6ad02..088bbdcf1aa4a 100644 +--- 
a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll ++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll +@@ -48,7 +48,7 @@ declare float @llvm.exp2.f32(float) #0 + + define void @sin_f64(double* nocapture %varray) { + ; CHECK-LABEL: @sin_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -71,7 +71,7 @@ for.end: + + define void @sin_f32(float* nocapture %varray) { + ; CHECK-LABEL: @sin_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -94,7 +94,7 @@ for.end: + + define void @sin_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @sin_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -117,7 +117,7 @@ for.end: + + define void @sin_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @sin_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -140,7 +140,7 @@ for.end: + + define void @cos_f64(double* nocapture %varray) { + ; CHECK-LABEL: @cos_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -163,7 +163,7 @@ for.end: + + define void @cos_f32(float* nocapture %varray) { + ; CHECK-LABEL: @cos_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x 
float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -186,7 +186,7 @@ for.end: + + define void @cos_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @cos_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -209,7 +209,7 @@ for.end: + + define void @cos_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @cos_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -232,7 +232,7 @@ for.end: + + define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { + ; CHECK-LABEL: @pow_f64( +-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ++; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -257,7 +257,7 @@ for.end: + + define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { + ; CHECK-LABEL: @pow_f64_intrinsic( +-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ++; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -282,7 +282,7 @@ for.end: + + define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { + ; CHECK-LABEL: @pow_f32( +-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> 
[[WIDE_LOAD:%.*]]) ++; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -307,7 +307,7 @@ for.end: + + define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { + ; CHECK-LABEL: @pow_f32_intrinsic( +-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ++; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -332,7 +332,7 @@ for.end: + + define void @exp_f64(double* nocapture %varray) { + ; CHECK-LABEL: @exp_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -355,7 +355,7 @@ for.end: + + define void @exp_f32(float* nocapture %varray) { + ; CHECK-LABEL: @exp_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -378,7 +378,7 @@ for.end: + + define void @exp_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @exp_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -401,7 +401,7 @@ for.end: + + define void @exp_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @exp_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -424,7 
+424,7 @@ for.end: + + define void @log_f64(double* nocapture %varray) { + ; CHECK-LABEL: @log_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -447,7 +447,7 @@ for.end: + + define void @log_f32(float* nocapture %varray) { + ; CHECK-LABEL: @log_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -470,7 +470,7 @@ for.end: + + define void @log_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @log_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -493,7 +493,7 @@ for.end: + + define void @log_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @log_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -516,7 +516,7 @@ for.end: + + define void @log2_f64(double* nocapture %varray) { + ; CHECK-LABEL: @log2_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -539,7 +539,7 @@ for.end: + + define void @log2_f32(float* nocapture %varray) { + ; CHECK-LABEL: @log2_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret 
void + ; + entry: +@@ -562,7 +562,7 @@ for.end: + + define void @log2_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @log2_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -585,7 +585,7 @@ for.end: + + define void @log2_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @log2_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -608,7 +608,7 @@ for.end: + + define void @log10_f64(double* nocapture %varray) { + ; CHECK-LABEL: @log10_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -631,7 +631,7 @@ for.end: + + define void @log10_f32(float* nocapture %varray) { + ; CHECK-LABEL: @log10_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -654,7 +654,7 @@ for.end: + + define void @log10_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @log10_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -677,7 +677,7 @@ for.end: + + define void @log10_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @log10_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: 
[[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -700,7 +700,7 @@ for.end: + + define void @sqrt_f64(double* nocapture %varray) { + ; CHECK-LABEL: @sqrt_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -723,7 +723,7 @@ for.end: + + define void @sqrt_f32(float* nocapture %varray) { + ; CHECK-LABEL: @sqrt_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -746,7 +746,7 @@ for.end: + + define void @exp2_f64(double* nocapture %varray) { + ; CHECK-LABEL: @exp2_f64( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -769,7 +769,7 @@ for.end: + + define void @exp2_f32(float* nocapture %varray) { + ; CHECK-LABEL: @exp2_f32( +-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -792,7 +792,7 @@ for.end: + + define void @exp2_f64_intrinsic(double* nocapture %varray) { + ; CHECK-LABEL: @exp2_f64_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -815,7 +815,7 @@ for.end: + + define void @exp2_f32_intrinsic(float* nocapture %varray) { + ; CHECK-LABEL: @exp2_f32_intrinsic( +-; CHECK: [[TMP5:%.*]] = call <4 x float> 
@__svml_exp2f4(<4 x float> [[TMP4:%.*]]) ++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]]) + ; CHECK: ret void + ; + entry: +@@ -836,4 +836,44 @@ for.end: + ret void + } + ++; CHECK-LABEL: @atan2_finite ++; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24( ++; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24( ++; CHECK: ret ++ ++declare double @__atan2_finite(double, double) local_unnamed_addr #0 ++ ++define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 { ++entry: ++ br label %for.cond1.preheader ++ ++for.cond1.preheader: ; preds = %for.inc7, %entry ++ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ] ++ %0 = trunc i64 %indvars.iv19 to i32 ++ %conv = sitofp i32 %0 to double ++ br label %for.body3 ++ ++for.body3: ; preds = %for.body3, %for.cond1.preheader ++ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %1 = trunc i64 %indvars.iv.next to i32 ++ %conv4 = sitofp i32 %1 to double ++ %call = tail call fast double @__atan2_finite(double %conv, double %conv4) ++ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv ++ store double %call, double* %arrayidx6, align 8 ++ %exitcond = icmp eq i64 %indvars.iv.next, 100 ++ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5 ++ ++for.inc7: ; preds = %for.body3 ++ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 ++ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100 ++ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader ++ ++for.end9: ; preds = %for.inc7 ++ ret void ++} ++ + attributes #0 = { nounwind readnone } ++!5 = distinct !{!5, !6, !7} ++!6 = !{!"llvm.loop.vectorize.width", i32 8} ++!7 = !{!"llvm.loop.vectorize.enable", i1 true} +diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll 
b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll +new file mode 100644 +index 0000000000000..326c763994343 +--- /dev/null ++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll +@@ -0,0 +1,513 @@ ++; Check legalization of SVML calls, including intrinsic versions (like @llvm..). ++ ++; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s ++ ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-unknown-linux-gnu" ++ ++declare double @sin(double) #0 ++declare float @sinf(float) #0 ++declare double @llvm.sin.f64(double) #0 ++declare float @llvm.sin.f32(float) #0 ++ ++declare double @cos(double) #0 ++declare float @cosf(float) #0 ++declare double @llvm.cos.f64(double) #0 ++declare float @llvm.cos.f32(float) #0 ++ ++declare double @pow(double, double) #0 ++declare float @powf(float, float) #0 ++declare double @llvm.pow.f64(double, double) #0 ++declare float @llvm.pow.f32(float, float) #0 ++ ++declare double @exp(double) #0 ++declare float @expf(float) #0 ++declare double @llvm.exp.f64(double) #0 ++declare float @llvm.exp.f32(float) #0 ++ ++declare double @log(double) #0 ++declare float @logf(float) #0 ++declare double @llvm.log.f64(double) #0 ++declare float @llvm.log.f32(float) #0 ++ ++ ++define void @sin_f64(double* nocapture %varray) { ++; CHECK-LABEL: @sin_f64( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @sin(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, 
double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @sin_f32(float* nocapture %varray) { ++; CHECK-LABEL: @sin_f32( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @sinf(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @sin_f64_intrinsic(double* nocapture %varray) { ++; CHECK-LABEL: @sin_f64_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @llvm.sin.f64(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @sin_f32_intrinsic(float* nocapture %varray) { ++; CHECK-LABEL: @sin_f32_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ 
++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @llvm.sin.f32(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @cos_f64(double* nocapture %varray) { ++; CHECK-LABEL: @cos_f64( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @cos(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @cos_f32(float* nocapture %varray) { ++; CHECK-LABEL: @cos_f32( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @cosf(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void 
@cos_f64_intrinsic(double* nocapture %varray) { ++; CHECK-LABEL: @cos_f64_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @llvm.cos.f64(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @cos_f32_intrinsic(float* nocapture %varray) { ++; CHECK-LABEL: @cos_f32_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @llvm.cos.f32(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { ++; CHECK-LABEL: @pow_f64( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) ++; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ 
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv ++ %tmp1 = load double, double* %arrayidx, align 4 ++ %tmp2 = tail call double @pow(double %conv, double %tmp1) ++ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %tmp2, double* %arrayidx2, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { ++; CHECK-LABEL: @pow_f64_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) ++; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv ++ %tmp1 = load double, double* %arrayidx, align 4 ++ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) ++ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %tmp2, double* %arrayidx2, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { ++; CHECK-LABEL: @pow_f32( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv ++ %tmp1 = load float, float* %arrayidx, align 4 ++ %tmp2 = tail call float @powf(float %conv, float %tmp1) ++ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %tmp2, float* %arrayidx2, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { ++; CHECK-LABEL: @pow_f32_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv ++ %tmp1 = load float, float* %arrayidx, align 4 ++ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) ++ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %tmp2, float* %arrayidx2, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @exp_f64(double* nocapture %varray) { ++; CHECK-LABEL: @exp_f64( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @exp(double %conv) ++ 
%arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @exp_f32(float* nocapture %varray) { ++; CHECK-LABEL: @exp_f32( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @expf(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @exp_f64_intrinsic(double* nocapture %varray) { ++; CHECK-LABEL: @exp_f64_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @llvm.exp.f64(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @exp_f32_intrinsic(float* nocapture %varray) { ++; CHECK-LABEL: @exp_f32_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 
x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @llvm.exp.f32(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @log_f64(double* nocapture %varray) { ++; CHECK-LABEL: @log_f64( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @log(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @log_f32(float* nocapture %varray) { ++; CHECK-LABEL: @log_f32( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @logf(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, 
label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @log_f64_intrinsic(double* nocapture %varray) { ++; CHECK-LABEL: @log_f64_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) ++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to double ++ %call = tail call double @llvm.log.f64(double %conv) ++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv ++ store double %call, double* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++define void @log_f32_intrinsic(float* nocapture %varray) { ++; CHECK-LABEL: @log_f32_intrinsic( ++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) ++; CHECK: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %tmp = trunc i64 %iv to i32 ++ %conv = sitofp i32 %tmp to float ++ %call = tail call float @llvm.log.f32(float %conv) ++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv ++ store float %call, float* %arrayidx, align 4 ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond = icmp eq i64 %iv.next, 1000 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} ++ ++attributes #0 = { nounwind readnone } ++ +diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll +new file mode 100644 +index 0000000000000..9422653445dc2 +--- /dev/null ++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll +@@ -0,0 
+1,61 @@ ++; Check that vector codegen splits illegal sin8 call to two sin4 calls on AVX for double datatype. ++; The C code used to generate this test: ++ ++; #include ++; ++; void foo(double *a, int N){ ++; int i; ++; #pragma clang loop vectorize_width(8) ++; for (i=0;i [[I0:%.*]] to <8 x double> ++; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> ++; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]]) ++; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> ++; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]]) ++; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> ++; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8 ++ ++ ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-unknown-linux-gnu" ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 { ++entry: ++ %cmp5 = icmp sgt i32 %N, 0 ++ br i1 %cmp5, label %for.body.preheader, label %for.end ++ ++for.body.preheader: ; preds = %entry ++ %wide.trip.count = zext i32 %N to i64 ++ br label %for.body ++ ++for.body: ; preds = %for.body, %for.body.preheader ++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ++ %0 = trunc i64 %indvars.iv to i32 ++ %conv = sitofp i32 %0 to double ++ %call = tail call fast double @sin(double %conv) #2 ++ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv ++ store double %call, double* %arrayidx, align 8, !tbaa !2 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count ++ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6 ++ ++for.end: ; preds = %for.body, %entry ++ ret void ++} ++ ++; Function Attrs: nounwind 
++declare dso_local double @sin(double) local_unnamed_addr #1 ++ ++!2 = !{!3, !3, i64 0} ++!3 = !{!"double", !4, i64 0} ++!4 = !{!"omnipotent char", !5, i64 0} ++!5 = !{!"Simple C/C++ TBAA"} ++!6 = distinct !{!6, !7} ++!7 = !{!"llvm.loop.vectorize.width", i32 8} +diff --git a/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll b/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll +index e8c83c4d9bd1f..615fdc29176a2 100644 +--- a/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll ++++ b/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll +@@ -12,12 +12,12 @@ target triple = "x86_64-unknown-linux-gnu" + + ; COMMON-LABEL: @llvm.compiler.used = appending global + ; SVML-SAME: [6 x i8*] [ +-; SVML-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__svml_sin2 to i8*), +-; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_sin4 to i8*), +-; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8 to i8*), +-; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_log10f4 to i8*), +-; SVML-SAME: i8* bitcast (<8 x float> (<8 x float>)* @__svml_log10f8 to i8*), +-; SVML-SAME: i8* bitcast (<16 x float> (<16 x float>)* @__svml_log10f16 to i8*) ++; SVML-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__svml_sin2_ha to i8*), ++; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_sin4_ha to i8*), ++; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8_ha to i8*), ++; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_log10f4_ha to i8*), ++; SVML-SAME: i8* bitcast (<8 x float> (<8 x float>)* @__svml_log10f8_ha to i8*), ++; SVML-SAME: i8* bitcast (<16 x float> (<16 x float>)* @__svml_log10f16_ha to i8*) + ; MASSV-SAME: [2 x i8*] [ + ; MASSV-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__sind2 to i8*), + ; MASSV-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__log10f4 to i8*) +@@ -59,9 +59,9 @@ declare float @llvm.log10.f32(float) #0 + attributes #0 = { nounwind readnone } + + ; SVML: attributes #[[SIN]] 
= { "vector-function-abi-variant"= +-; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2), +-; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4), +-; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" } ++; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2_ha), ++; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4_ha), ++; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8_ha)" } + + ; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"= + ; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" } +diff --git a/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt b/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt +index 97df6a55d1b59..199e0285c9e5d 100644 +--- a/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt ++++ b/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt +@@ -47,6 +47,7 @@ add_tablegen(llvm-tblgen LLVM + SearchableTableEmitter.cpp + SubtargetEmitter.cpp + SubtargetFeatureInfo.cpp ++ SVMLEmitter.cpp + TableGen.cpp + Types.cpp + X86DisassemblerTables.cpp +diff --git a/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp b/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp +new file mode 100644 +index 0000000000000..a5aeea48db28b +--- /dev/null ++++ b/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp +@@ -0,0 +1,110 @@ ++//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This tablegen backend emits the scalar to svml function map for TLI. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "CodeGenTarget.h" ++#include "llvm/Support/Format.h" ++#include "llvm/TableGen/Error.h" ++#include "llvm/TableGen/Record.h" ++#include "llvm/TableGen/TableGenBackend.h" ++#include <map> ++#include <vector> ++ ++using namespace llvm; ++ ++#define DEBUG_TYPE "SVMLVariants" ++#include "llvm/Support/Debug.h" ++ ++namespace { ++ ++class SVMLVariantsEmitter { ++ ++ RecordKeeper &Records; ++ ++private: ++ void emitSVMLVariants(raw_ostream &OS); ++ ++public: ++ SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {} ++ ++ void run(raw_ostream &OS); ++}; ++} // End anonymous namespace ++ ++/// \brief Emit the set of SVML variant function names. ++// The default is to emit the high accuracy SVML variants until a mechanism is ++// introduced to allow a selection of different variants through precision ++// requirements specified by the user. This code generates mappings to svml ++// that are in the scalar form of llvm intrinsics, math library calls, or the ++// finite variants of math library calls. ++void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) { ++ ++ const unsigned MinSinglePrecVL = 4; ++ const unsigned MaxSinglePrecVL = 16; ++ const unsigned MinDoublePrecVL = 2; ++ const unsigned MaxDoublePrecVL = 8; ++ ++ OS << "#ifdef GET_SVML_VARIANTS\n"; ++ ++ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) { ++ StringRef SvmlVariantNameStr = D->getName(); ++ // Single Precision SVML ++ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) { ++ // Emit the scalar math library function to svml function entry. ++ OS << "{\"" << SvmlVariantNameStr << "f" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << "ElementCount::getFixed(" << VL << ")},\n"; ++ ++ // Emit the scalar intrinsic to svml function entry. ++ OS << "{\"" << "llvm."
<< SvmlVariantNameStr << ".f32" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << "ElementCount::getFixed(" << VL << ")},\n"; ++ ++ // Emit the finite math library function to svml function entry. ++ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " ++ << "ElementCount::getFixed(" << VL << ")},\n"; ++ } ++ ++ // Double Precision SVML ++ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) { ++ // Emit the scalar math library function to svml function entry. ++ OS << "{\"" << SvmlVariantNameStr << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL ++ << ")},\n"; ++ ++ // Emit the scalar intrinsic to svml function entry. ++ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL ++ << ")},\n"; ++ ++ // Emit the finite math library function to svml function entry. 
++ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", "; ++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " ++ << "ElementCount::getFixed(" << VL << ")},\n"; ++ } ++ } ++ ++ OS << "#endif // GET_SVML_VARIANTS\n\n"; ++} ++ ++void SVMLVariantsEmitter::run(raw_ostream &OS) { ++ emitSVMLVariants(OS); ++} ++ ++namespace llvm { ++ ++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) { ++ SVMLVariantsEmitter(RK).run(OS); ++} ++ ++} // End llvm namespace +diff --git a/llvm-14.0.6.src/utils/TableGen/TableGen.cpp b/llvm-14.0.6.src/utils/TableGen/TableGen.cpp +index 2d4a45f889be6..603d0c223b33a 100644 +--- a/llvm-14.0.6.src/utils/TableGen/TableGen.cpp ++++ b/llvm-14.0.6.src/utils/TableGen/TableGen.cpp +@@ -57,6 +57,7 @@ enum ActionType { + GenAutomata, + GenDirectivesEnumDecl, + GenDirectivesEnumImpl, ++ GenSVMLVariants, + }; + + namespace llvm { +@@ -138,7 +139,9 @@ cl::opt Action( + clEnumValN(GenDirectivesEnumDecl, "gen-directive-decl", + "Generate directive related declaration code (header file)"), + clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl", +- "Generate directive related implementation code"))); ++ "Generate directive related implementation code"), ++ clEnumValN(GenSVMLVariants, "gen-svml", ++ "Generate SVML variant function names"))); + + cl::OptionCategory PrintEnumsCat("Options for -print-enums"); + cl::opt Class("class", cl::desc("Print Enum list for this class"), +@@ -272,6 +275,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { + case GenDirectivesEnumImpl: + EmitDirectivesImpl(Records, OS); + break; ++ case GenSVMLVariants: ++ EmitSVMLVariants(Records, OS); ++ break; + } + + return false; +diff --git a/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h b/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h +index 71db8dc77b052..86c3a3068c2dc 100644 +--- a/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h ++++ b/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h +@@ -93,6 +93,7 @@ void EmitExegesis(RecordKeeper 
&RK, raw_ostream &OS); + void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); + void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS); + void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS); ++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS); + + } // End llvm namespace + +diff --git a/llvm-14.0.6.src/utils/vim/syntax/llvm.vim b/llvm-14.0.6.src/utils/vim/syntax/llvm.vim +index 205db16b7d8cd..2572ab5a59e1b 100644 +--- a/llvm-14.0.6.src/utils/vim/syntax/llvm.vim ++++ b/llvm-14.0.6.src/utils/vim/syntax/llvm.vim +@@ -104,6 +104,7 @@ syn keyword llvmKeyword + \ inreg + \ intel_ocl_bicc + \ inteldialect ++ \ intel_svmlcc + \ internal + \ jumptable + \ linkonce diff --git a/conda-recipes/llvmdev/bld.bat b/conda-recipes/llvmdev/bld.bat index 1ce228c80..e48800dc5 100644 --- a/conda-recipes/llvmdev/bld.bat +++ b/conda-recipes/llvmdev/bld.bat @@ -1,3 +1,13 @@ +setlocal EnableDelayedExpansion +FOR /D %%d IN (llvm-*.src) DO (MKLINK /J llvm %%d +if !errorlevel! neq 0 exit /b %errorlevel%) +FOR /D %%d IN (lld-*.src) DO (MKLINK /J lld %%d +if !errorlevel! neq 0 exit /b %errorlevel%) +FOR /D %%d IN (unwind\libunwind-*.src) DO (MKLINK /J libunwind %%d +if !errorlevel! neq 0 exit /b %errorlevel%) + +DIR + mkdir build cd build @@ -24,31 +34,18 @@ REM the 64bit linker anyway. This must be passed in to certain generators as REM '-Thost x64'. set PreferredToolArchitecture=x64 -set MAX_INDEX_CMAKE_GENERATOR=2 - -REM On older generators we can squeete the architecture into the generator -REM name. In newer generators, we must use the -A flag for cmake to hand in the -REM correct architecture. Also, using Visual Studio 16 2019 we use toolset -REM v141, which basically means use a Visual Studio 15 2017 type compiler from -REM Visual Studio 16 2019. 
See also: -REM https://stackoverflow.com/questions/55708600/whats-the-cmake-generator-for-visual-studio-2019 +set MAX_INDEX_CMAKE_GENERATOR=0 -set "CMAKE_GENERATOR[0]=Visual Studio 14 2015%ARCH_POSTFIX%" -set "CMAKE_GENERATOR[1]=Visual Studio 15 2017%ARCH_POSTFIX%" -set "CMAKE_GENERATOR[2]=Visual Studio 16 2019" +set "CMAKE_GENERATOR[0]=Visual Studio 16 2019" -set "CMAKE_GENERATOR_ARCHITECTURE[0]=" -set "CMAKE_GENERATOR_ARCHITECTURE[1]=" -set "CMAKE_GENERATOR_ARCHITECTURE[2]=%GEN_ARCH%" +set "CMAKE_GENERATOR_ARCHITECTURE[0]=%GEN_ARCH%" -set "CMAKE_GENERATOR_TOOLSET[0]=host %PreferredToolArchitecture%" -set "CMAKE_GENERATOR_TOOLSET[1]=host %PreferredToolArchitecture%" -set "CMAKE_GENERATOR_TOOLSET[2]=v141" +set "CMAKE_GENERATOR_TOOLSET[0]=v142" REM Reduce build times and package size by removing unused stuff REM BENCHMARKS (new for llvm8) don't build under Visual Studio 14 2015 set CMAKE_CUSTOM=-DLLVM_TARGETS_TO_BUILD="%LLVM_TARGETS_TO_BUILD%" ^ - -DLLVM_INCLUDE_TESTS=OFF ^ + -DLLVM_ENABLE_PROJECTS:STRING=lld ^ -DLLVM_INCLUDE_UTILS=ON ^ -DLLVM_INCLUDE_DOCS=OFF ^ -DLLVM_INCLUDE_EXAMPLES=OFF ^ @@ -67,7 +64,7 @@ for /l %%n in (0,1,%MAX_INDEX_CMAKE_GENERATOR%) do ( -DCMAKE_BUILD_TYPE="%BUILD_CONFIG%" ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX:PATH="%LIBRARY_PREFIX%" ^ - %CMAKE_CUSTOM% "%SRC_DIR%" + %CMAKE_CUSTOM% "%SRC_DIR%\llvm" if not errorlevel 1 goto configuration_successful del CMakeCache.txt ) diff --git a/conda-recipes/llvmdev/build.sh b/conda-recipes/llvmdev/build.sh index fd99eee90..2cc8464c6 100644 --- a/conda-recipes/llvmdev/build.sh +++ b/conda-recipes/llvmdev/build.sh @@ -15,10 +15,14 @@ else DARWIN_TARGET=x86_64-apple-darwin13.4.0 fi +mv llvm-*.src llvm +mv lld-*.src lld +mv unwind/libunwind-*.src libunwind declare -a _cmake_config _cmake_config+=(-DCMAKE_INSTALL_PREFIX:PATH=${PREFIX}) _cmake_config+=(-DCMAKE_BUILD_TYPE:STRING=Release) +_cmake_config+=(-DLLVM_ENABLE_PROJECTS:STRING="lld") # The bootstrap clang I use was built with a 
static libLLVMObject.a and I trying to get the same here # _cmake_config+=(-DBUILD_SHARED_LIBS:BOOL=ON) _cmake_config+=(-DLLVM_ENABLE_ASSERTIONS:BOOL=ON) @@ -27,6 +31,7 @@ _cmake_config+=(-DLINK_POLLY_INTO_TOOLS:BOOL=ON) _cmake_config+=(-DLLVM_ENABLE_LIBXML2:BOOL=OFF) # Urgh, llvm *really* wants to link to ncurses / terminfo and we *really* do not want it to. _cmake_config+=(-DHAVE_TERMINFO_CURSES=OFF) +_cmake_config+=(-DLLVM_ENABLE_TERMINFO=OFF) # Sometimes these are reported as unused. Whatever. _cmake_config+=(-DHAVE_TERMINFO_NCURSES=OFF) _cmake_config+=(-DHAVE_TERMINFO_NCURSESW=OFF) @@ -39,10 +44,10 @@ _cmake_config+=(-DLLVM_ENABLE_RTTI=OFF) _cmake_config+=(-DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}) _cmake_config+=(-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly) _cmake_config+=(-DLLVM_INCLUDE_UTILS=ON) # for llvm-lit +_cmake_config+=(-DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF) # doesn't build without the rest of LLVM project # TODO :: It would be nice if we had a cross-ecosystem 'BUILD_TIME_LIMITED' env var we could use to # disable these unnecessary but useful things. if [[ ${CONDA_FORGE} == yes ]]; then - _cmake_config+=(-DLLVM_INCLUDE_TESTS=OFF) _cmake_config+=(-DLLVM_INCLUDE_DOCS=OFF) _cmake_config+=(-DLLVM_INCLUDE_EXAMPLES=OFF) fi @@ -76,7 +81,7 @@ cd build cmake -G'Unix Makefiles' \ "${_cmake_config[@]}" \ - .. + ../llvm ARCH=`uname -m` if [ $ARCH == 'armv7l' ]; then # RPi need thread count throttling @@ -85,6 +90,8 @@ else make -j${CPU_COUNT} VERBOSE=1 fi +make check-llvm-unit || exit $? + # From: https://github.com/conda-forge/llvmdev-feedstock/pull/53 make install || exit $? @@ -93,10 +100,3 @@ if [[ $ARCH == 'x86_64' ]]; then bin/opt -S -vector-library=SVML -mcpu=haswell -O3 $RECIPE_DIR/numba-3016.ll | bin/FileCheck $RECIPE_DIR/numba-3016.ll || exit $? 
fi -# run the tests, skip some on linux-32 -cd ../test -if [[ $ARCH == 'i686' ]]; then - ../build/bin/llvm-lit -vv Transforms Analysis CodeGen/X86 -else - ../build/bin/llvm-lit -vv Transforms ExecutionEngine Analysis CodeGen/X86 -fi diff --git a/conda-recipes/llvmdev/meta.yaml b/conda-recipes/llvmdev/meta.yaml index 27b596ffc..1a8e67032 100644 --- a/conda-recipes/llvmdev/meta.yaml +++ b/conda-recipes/llvmdev/meta.yaml @@ -1,8 +1,9 @@ -{% set shortversion = "11.1" %} -{% set version = "11.1.0" %} -{% set sha256_llvm = "ce8508e318a01a63d4e8b3090ab2ded3c598a50258cc49e2625b9120d4c03ea5" %} -{% set sha256_lld = "017a788cbe1ecc4a949abf10755870519086d058a2e99f438829aef24f0c66ce" %} -{% set build_number = "5" %} +{% set shortversion = "14.0" %} +{% set version = "14.0.6" %} +{% set sha256_llvm = "050922ecaaca5781fdf6631ea92bc715183f202f9d2f15147226f023414f619a" %} +{% set sha256_lld = "0c28ce0496934d37d20fec96591032dd66af8d10178a45762e0e75e85cf95ad3" %} +{% set sha256_libunwind = "3bbe9c23c73259fe39c045dc87d0b283236ba6e00750a226b2c2aeac4a51d86b" %} +{% set build_number = "0" %} package: name: llvmdev @@ -13,20 +14,16 @@ source: fn: llvm-{{ version }}.src.tar.xz sha256: {{ sha256_llvm }} patches: - - ../partial-testing.patch - # Intel SVML optimizations (two patches) - - ../intel-D47188-svml-VF.patch - # Second patch from https://github.com/conda-forge/llvmdev-feedstock/blob/c706309/recipe/patches/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch - - ../expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch - # Reverts a patch limiting non-GlobalValue name length - - ../0001-Revert-Limit-size-of-non-GlobalValue-name.patch - # Fixes for aarch64 on LLVM 11 from https://reviews.llvm.org/D104123 - - ../llvm_11_consecutive_registers.patch - + - ../llvm14-remove-use-of-clonefile.patch + - ../llvm14-svml.patch - url: https://github.com/llvm/llvm-project/releases/download/llvmorg-{{ version }}/lld-{{ version }}.src.tar.xz fn: lld-{{ version }}.src.tar.xz sha256: {{ 
sha256_lld }} - folder: tools/lld + + - url: https://github.com/llvm/llvm-project/releases/download/llvmorg-{{ version }}/libunwind-{{ version }}.src.tar.xz + fn: libunwind-{{ version }}.src.tar.xz + sha256: {{ sha256_libunwind }} + folder: unwind build: number: {{ build_number }} @@ -81,5 +78,5 @@ about: home: http://llvm.org/ dev_url: https://github.com/llvm-mirror/llvm license: NCSA - license_file: LICENSE.TXT + license_file: llvm/LICENSE.TXT summary: Development headers and libraries for LLVM diff --git a/conda-recipes/llvmlite/bld.bat b/conda-recipes/llvmlite/bld.bat index 475a0637c..d7342e249 100755 --- a/conda-recipes/llvmlite/bld.bat +++ b/conda-recipes/llvmlite/bld.bat @@ -12,11 +12,8 @@ if "%ARCH%"=="32" ( @rem set CMAKE_GENERATOR_ARCH=Win64 set CMAKE_GENERATOR_ARCH=x64 ) -@rem for older VS: -@rem set CMAKE_GENERATOR=Visual Studio 15 2017 -@rem do not set CMAKE_GENERATOR_TOOLKIT set CMAKE_GENERATOR=Visual Studio 16 2019 -set CMAKE_GENERATOR_TOOLKIT=v141 +set CMAKE_GENERATOR_TOOLKIT=v142 @rem Ensure there are no build leftovers (CMake can complain) if exist ffi\build rmdir /S /Q ffi\build diff --git a/conda-recipes/llvmlite/meta.yaml b/conda-recipes/llvmlite/meta.yaml index bf083becd..ff897f7af 100644 --- a/conda-recipes/llvmlite/meta.yaml +++ b/conda-recipes/llvmlite/meta.yaml @@ -1,4 +1,4 @@ -{% set VERSION_SUFFIX = "" %} # debug version suffix, appended to the version +{% set VERSION_SUFFIX = "llvm14" %} # debug version suffix, appended to the version package: name: llvmlite @@ -23,19 +23,19 @@ requirements: # build.sh deals with it! 
- {{ compiler('c') }} # [not (osx or armv6l or armv7l or win)] - {{ compiler('cxx') }} # [not (osx or armv6l or armv7l or win)] - - vs2017_{{ target_platform }} # [win] + - vs2015_{{ target_platform }} # [win] # The DLL build uses cmake on Windows - cmake # [win] - make # [unix and not (armv6l or armv7l or aarch64)] host: - python # On channel https://anaconda.org/numba/ - - llvmdev 11.1.0 *5 # [(osx and arm64)] - - llvmdev 11.1.0 *4 # [not ((osx and arm64) or win)] - - llvmdev 11.1.0 4 # [win] + - llvmdev 14 - vs2015_runtime # [win] # llvmdev is built with libz compression support - zlib # [unix and not (armv6l or armv7l)] + # requires libxml2 + - libxml2 # [win] run: - python >=3.8,<3.10 - vs2015_runtime # [win] diff --git a/ffi/Makefile.freebsd b/ffi/Makefile.freebsd index ba727e331..7b869e876 100644 --- a/ffi/Makefile.freebsd +++ b/ffi/Makefile.freebsd @@ -1,5 +1,5 @@ -CXX = clang++ -std=c++11 -stdlib=libc++ +CXX = clang++ -stdlib=libc++ # -flto and --exclude-libs allow us to remove those parts of LLVM we don't use CXX_FLTO_FLAGS ?= -flto diff --git a/ffi/Makefile.osx b/ffi/Makefile.osx index bc192071e..74dccf32c 100644 --- a/ffi/Makefile.osx +++ b/ffi/Makefile.osx @@ -1,6 +1,6 @@ -CXX = clang++ -std=c++11 -stdlib=libc++ -CXXFLAGS = $(LLVM_CXXFLAGS) +CXX = clang++ +CXXFLAGS = $(LLVM_CXXFLAGS) -O3 # Only export the LLVMPY symbols we require and exclude everything else. 
EXPORT = "-Wl,-exported_symbol,_LLVMPY_*" LDFLAGS := $(LDFLAGS) $(EXPORT) $(LLVM_LDFLAGS) diff --git a/ffi/build.py b/ffi/build.py index 55343fca5..e58a691e0 100755 --- a/ffi/build.py +++ b/ffi/build.py @@ -72,10 +72,10 @@ def find_windows_generator(): ) generators.extend([ - # use VS2017 toolkit on VS2019 to match how llvmdev is built - ('Visual Studio 16 2019', ('x64' if is_64bit else 'Win32'), 'v141'), - # This is the generator configuration for VS2017 - ('Visual Studio 15 2017' + (' Win64' if is_64bit else ''), None, None) + # use VS2019 to match how llvmdev is built + ('Visual Studio 16 2019', ('x64' if is_64bit else 'Win32'), 'v142'), + # # This is the generator configuration for VS2017 + # ('Visual Studio 15 2017' + (' Win64' if is_64bit else ''), None, None) ]) for generator in generators: build_dir = tempfile.mkdtemp() @@ -163,9 +163,10 @@ def main_posix(kind, library_ext): print(msg) print(warning + '\n') else: - - if not out.startswith('11'): - msg = ("Building llvmlite requires LLVM 11.x.x, got " + (version, _) = out.split('.', 1) + version = int(version) + if version < 11 or version > 14: + msg = ("Building llvmlite requires LLVM 11, 12, 13, or 14, got " "{!r}. 
Be sure to set LLVM_CONFIG to the right executable " "path.\nRead the documentation at " "http://llvmlite.pydata.org/ for more information about " diff --git a/ffi/passmanagers.cpp b/ffi/passmanagers.cpp index dd67ca5cc..60064cf10 100644 --- a/ffi/passmanagers.cpp +++ b/ffi/passmanagers.cpp @@ -16,11 +16,8 @@ #include "llvm-c/Transforms/IPO.h" #include "llvm-c/Transforms/Scalar.h" -#include "llvm/IR/LegacyPassManager.h" -#if LLVM_VERSION_MAJOR > 11 -#include "llvm/IR/RemarkStreamer.h" -#endif #include "llvm/IR/LLVMRemarkStreamer.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Remarks/RemarkStreamer.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" @@ -220,7 +217,11 @@ LLVMPY_AddLazyValueInfoPass(LLVMPassManagerRef PM) { } API_EXPORT(void) LLVMPY_AddLintPass(LLVMPassManagerRef PM) { +#if LLVM_VERSION_MAJOR < 12 unwrap(PM)->add(llvm::createLintPass()); +#else + unwrap(PM)->add(llvm::createLintLegacyPassPass()); +#endif } API_EXPORT(void) LLVMPY_AddModuleDebugInfoPrinterPass(LLVMPassManagerRef PM) { diff --git a/ffi/targets.cpp b/ffi/targets.cpp index 3b5abf510..b96d22c9f 100644 --- a/ffi/targets.cpp +++ b/ffi/targets.cpp @@ -6,7 +6,11 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Type.h" #include "llvm/Support/Host.h" +#if LLVM_VERSION_MAJOR > 13 +#include "llvm/MC/TargetRegistry.h" +#else #include "llvm/Support/TargetRegistry.h" +#endif #include "llvm/Target/TargetMachine.h" #include @@ -204,7 +208,11 @@ LLVMPY_CreateTargetMachine(LLVMTargetRef T, const char *Triple, const char *CPU, rm = Reloc::DynamicNoPIC; TargetOptions opt; +#if LLVM_VERSION_MAJOR < 12 opt.PrintMachineCode = PrintMC; +#else + opt.MCOptions.ShowMCInst = PrintMC; +#endif opt.MCOptions.ABIName = ABIName; bool jit = JIT; diff --git a/ffi/value.cpp b/ffi/value.cpp index 771acd423..01871699d 100644 --- a/ffi/value.cpp +++ b/ffi/value.cpp @@ -153,8 +153,13 @@ LLVMPY_ArgumentAttributesIter(LLVMValueRef A) { using namespace llvm; Argument *arg = unwrap(A); 
unsigned argno = arg->getArgNo(); - AttributeSet attrs = - arg->getParent()->getAttributes().getParamAttributes(argno); + const AttributeSet attrs = arg->getParent()->getAttributes(). +#if LLVM_VERSION_MAJOR < 14 + getParamAttributes(argno) +#else + getParamAttrs(argno) +#endif + ; return wrap(new AttributeSetIterator(attrs.begin(), attrs.end())); } @@ -353,7 +358,11 @@ LLVMPY_GetElementType(LLVMTypeRef type) { llvm::Type *unwrapped = llvm::unwrap(type); llvm::PointerType *ty = llvm::dyn_cast(unwrapped); if (ty != nullptr) { +#if LLVM_VERSION_MAJOR < 14 return llvm::wrap(ty->getElementType()); +#else + return llvm::wrap(ty->getPointerElementType()); +#endif } return nullptr; } diff --git a/llvmlite/binding/passmanagers.py b/llvmlite/binding/passmanagers.py index 26f7bd259..4b9daf468 100644 --- a/llvmlite/binding/passmanagers.py +++ b/llvmlite/binding/passmanagers.py @@ -199,7 +199,8 @@ def add_lint_pass(self): """ See https://llvm.org/docs/Passes.html#lint-statically-lint-checks-llvm-ir - LLVM 11+: `llvm::createLintPass` + LLVM 11: `llvm::createLintPass` + LLVM 12+: `llvm::createLintLegacyPassPass` """ # noqa E501 ffi.lib.LLVMPY_AddLintPass(self) diff --git a/llvmlite/tests/test_binding.py b/llvmlite/tests/test_binding.py index dc4dbc484..70902e04c 100644 --- a/llvmlite/tests/test_binding.py +++ b/llvmlite/tests/test_binding.py @@ -640,7 +640,7 @@ def test_set_option(self): def test_version(self): major, minor, patch = llvm.llvm_version_info # one of these can be valid - valid = [(11,)] + valid = [(11,), (12, ), (13, ), (14, )] self.assertIn((major,), valid) self.assertIn(patch, range(10)) From 355338e931f488926b07a2f6eaf83ecd39e9abb7 Mon Sep 17 00:00:00 2001 From: Andre Masella Date: Thu, 8 Dec 2022 17:26:05 -0500 Subject: [PATCH] Automatically detect common return blocks in ref prune Change reference pruning algorithm to detect when a common return block is generated and determine if it returns non-zero, indicating an exception path. 
LLVM 14 automatically generates code like this. --- ffi/custom_passes.cpp | 83 ++++++++++++++++++++++++--------- llvmlite/tests/test_refprune.py | 25 +++++++++- 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/ffi/custom_passes.cpp b/ffi/custom_passes.cpp index 21e0bbcff..a04b4b64c 100644 --- a/ffi/custom_passes.cpp +++ b/ffi/custom_passes.cpp @@ -905,9 +905,7 @@ struct RefPrunePass : public FunctionPass { } /** - * Check if a basic block is a block which raises, this relies on a - * metadata "ret_is_raise" being present the terminator and the - * terminator opcode being Instruction::Ret. + * Check if a basic block is a block which raises, based on the return value. * * Parameters: * - bb a basic block @@ -920,27 +918,68 @@ struct RefPrunePass : public FunctionPass { // Get the terminator auto term = bb->getTerminator(); - // Get the opcode of the terminator, if it's not a Ret then return false - if (term->getOpcode() != Instruction::Ret) - return false; - // Get the metadata on the terminator node - auto md = term->getMetadata("ret_is_raise"); - // If there's no metadata return false (normal or unmarked Ret) - if (!md) - return false; - // If the number of operands on the metadata is not 1 then return false - if (md->getNumOperands() != 1) + // Get the opcode of the terminator, if it's a Ret then check + if (term->getOpcode() == Instruction::Ret) { + // With one operand + if (term->getNumOperands() != 1) { + return false; + } + auto operand = term->getOperand(0); + // If the operand is a constant, check if it indicates an exception + auto int_operand = dyn_cast(operand); + if (int_operand && int_operand->isOneValue()) { + return true; + } + // If the operand is a PHI node, check if there is a non-exception + // path. We don't know which path we're on, but since the + // exceptional path will look ahead, if there is a non-exceptional + // path, we can assume we're on it. 
+ auto phi_operand = dyn_cast(operand); + if (phi_operand) { + for (auto& phi_arg_value : phi_operand->incoming_values()) { + auto arg_value = dyn_cast(phi_arg_value); + if (arg_value && !arg_value->isOneValue()) { + return false; + } + } + return true; + } return false; - // Fetch the ref to the metadata operand at location 0 - auto &operand = md->getOperand(0); - // and then cast the const as Metadata (Numba sets this as literal 1) - auto data = dyn_cast(operand.get()); - // If dyn_cast failed type check then return false - if (!data) + } else if (term->getOpcode() == Instruction::Br && + term->getNumOperands() == 1) { + // If it's a branch, it might be a common return block + auto first = + term->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime(true); + if (!first) { + // Malformed block with no terminal instruction + return false; + } + // Our one and only instruction should be a return + if (first->getOpcode() != Instruction::Ret) { + return false; + } + // With one operand + if (first->getNumOperands() != 1) { + return false; + } + auto operand = first->getOperand(0); + // If the operand is a constant, check if it indicates an exception + auto int_operand = dyn_cast(operand); + if (int_operand && int_operand->isOneValue()) { + return true; + } + // If the operand is a PHI node, check if the path we're on will + // yield a value indicating an exception + auto phi_operand = dyn_cast(operand); + if (phi_operand) { + auto arg_value = dyn_cast( + phi_operand->getIncomingValueForBlock(bb)); + return arg_value && arg_value->isOneValue(); + } + // This path doesn't raise return false; - // get the value of the casted metadata and then return bool on whether - // it is the number one. 
- return data->getValue()->isOneValue(); + } + return false; } /** diff --git a/llvmlite/tests/test_refprune.py b/llvmlite/tests/test_refprune.py index d4f7b3035..ba53ab1ff 100644 --- a/llvmlite/tests/test_refprune.py +++ b/llvmlite/tests/test_refprune.py @@ -456,7 +456,8 @@ def test_fanout_raise_1(self): def test_fanout_raise_2(self): mod, stats = self.check(self.fanout_raise_2) - self.assertEqual(stats.fanout_raise, 0) + # Change in behaviour: ignore bad metadata + self.assertEqual(stats.fanout_raise, 2) fanout_raise_3 = r""" define i32 @main(i8* %ptr, i1 %cond) { @@ -495,6 +496,28 @@ def test_fanout_raise_4(self): mod, stats = self.check(self.fanout_raise_4) self.assertEqual(stats.fanout_raise, 0) + fanout_raise_5 = r""" +define i32 @main(i8* %ptr, i1 %cond) { +bb_A: + call void @NRT_incref(i8* %ptr) + br i1 %cond, label %bb_B, label %bb_C +bb_B: + call void @NRT_decref(i8* %ptr) + br label %common.ret +bb_C: + br label %common.ret ; pretend we throw an exception +common.ret: + %common.ret.op = phi i32 [ 0, %bb_B ], [ 1, %bb_C ] + ret i32 %common.ret.op +} + +!0 = !{i32 1} +""" + + def test_fanout_raise_5(self): + mod, stats = self.check(self.fanout_raise_5) + self.assertEqual(stats.fanout_raise, 2) + if __name__ == '__main__': unittest.main()