diff --git a/libs/eigen/.clang-format b/libs/eigen/.clang-format new file mode 100644 index 0000000..28251c6 --- /dev/null +++ b/libs/eigen/.clang-format @@ -0,0 +1,12 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 120 +SortIncludes: false +AttributeMacros: +- EIGEN_STRONG_INLINE +- EIGEN_ALWAYS_INLINE +- EIGEN_DEVICE_FUNC +- EIGEN_DONT_INLINE +- EIGEN_DEPRECATED +- EIGEN_UNUSED diff --git a/libs/eigen/CMakeLists.txt b/libs/eigen/CMakeLists.txt index f3e69b8..a57caee 100644 --- a/libs/eigen/CMakeLists.txt +++ b/libs/eigen/CMakeLists.txt @@ -1,8 +1,35 @@ # cmake_minimum_require must be the first command of the file -cmake_minimum_required(VERSION 3.5.0) +cmake_minimum_required(VERSION 3.10.0) + +# NOTE Remove setting the policy once the minimum required CMake version is +# increased to at least 3.15. Retain enabling the export to package registry. +if (POLICY CMP0090) + # The export command does not populate package registry by default + cmake_policy (SET CMP0090 NEW) + + # Unless otherwise specified, always export to package registry to ensure + # backwards compatibility. + if (NOT DEFINED CMAKE_EXPORT_PACKAGE_REGISTRY) + set (CMAKE_EXPORT_PACKAGE_REGISTRY ON) + endif (NOT DEFINED CMAKE_EXPORT_PACKAGE_REGISTRY) +endif (POLICY CMP0090) project(Eigen3) +# Remove this block after bumping CMake to v3.21.0 +# PROJECT_IS_TOP_LEVEL is defined then by default +if(CMAKE_VERSION VERSION_LESS 3.21.0) + if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(PROJECT_IS_TOP_LEVEL TRUE) + else() + set(PROJECT_IS_TOP_LEVEL FALSE) + endif() +endif() + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "Default C++ standard") +set(CMAKE_CXX_STANDARD_REQUIRED ON CACHE BOOL "Require C++ standard") +set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Allow C++ extensions") + # guard against in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -23,7 +50,7 @@ endif() ############################################################################# -# retrieve version information # +# retrieve version information # ############################################################################# # automatically parse the version number @@ -61,10 +88,6 @@ include(CMakeDependentOption) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - -option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF) - - macro(ei_add_cxx_compiler_flag FLAG) string(REGEX REPLACE "-" "" SFLAG1 ${FLAG}) string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1}) @@ -74,20 +97,6 @@ macro(ei_add_cxx_compiler_flag FLAG) endif() endmacro() -check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11) - -if(EIGEN_TEST_CXX11) - set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_EXTENSIONS OFF) - if(EIGEN_COMPILER_SUPPORT_CPP11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - #set(CMAKE_CXX_STANDARD 03) - #set(CMAKE_CXX_EXTENSIONS OFF) - ei_add_cxx_compiler_flag("-std=c++03") -endif() - # Determine if we should build shared libraries on this platform. 
get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) @@ -100,6 +109,8 @@ find_package(StandardMathLibrary) set(EIGEN_TEST_CUSTOM_LINKER_FLAGS "" CACHE STRING "Additional linker flags when linking unit tests.") set(EIGEN_TEST_CUSTOM_CXX_FLAGS "" CACHE STRING "Additional compiler flags when compiling unit tests.") +# convert space separated argument into CMake lists for downstream consumption +separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND ${EIGEN_TEST_CUSTOM_CXX_FLAGS}) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "") @@ -109,13 +120,11 @@ if(NOT STANDARD_MATH_LIBRARY_FOUND) "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.") else() - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}") else() set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") endif() - endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -125,6 +134,7 @@ else() endif() option(EIGEN_BUILD_BTL "Build benchmark suite" OFF) +option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF) # Disable pkgconfig only for native Windows builds if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) @@ -183,18 +193,6 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor - - # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails - # Moreover we should not set both -strict-ansi and -ansi - check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI) - ei_add_cxx_compiler_flag("-Qunused-arguments") # disable clang warning: argument unused during compilation: '-ansi' - - if(COMPILER_SUPPORT_STRICTANSI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi") - else() - ei_add_cxx_compiler_flag("-ansi") - endif() - if(ANDROID_NDK) ei_add_cxx_compiler_flag("-pie") ei_add_cxx_compiler_flag("-fPIE") @@ -253,20 +251,20 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX512 in tests/examples") endif() option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) if(EIGEN_TEST_AVX512DQ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma") message(STATUS "Enabling AVX512DQ in tests/examples") endif() + + option(EIGEN_TEST_AVX512FP16 "Enable/Disable AVX512-FP16 in tests/examples" OFF) + if(EIGEN_TEST_AVX512FP16) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -mavx512vl -mavx512fp16") + message(STATUS "Enabling AVX512-FP16 in tests/examples") + endif() option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF) if(EIGEN_TEST_F16C) @@ -369,11 +367,19 @@ else() endif() option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) - if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + option(EIGEN_TEST_AVX2 "Enable/Disable FMA/AVX2 in tests/examples" 
OFF) + if((EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) OR EIGEN_TEST_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") message(STATUS "Enabling FMA/AVX2 in tests/examples") endif() + option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) + option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) + if(EIGEN_TEST_AVX512 OR EIGEN_TEST_AVX512DQ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512") + message(STATUS "Enabling AVX512 in tests/examples") + endif() + endif() option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) @@ -416,7 +422,8 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") +set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") +set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) @@ -450,17 +457,6 @@ foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR) endif() endforeach() -# similar to set_target_properties but append the property instead of overwriting it -macro(ei_add_target_property target prop value) - - get_target_property(previous ${target} ${prop}) - # if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if() - if(NOT previous) - set(previous "") - endif() - set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}") -endmacro() - install(FILES signature_of_eigen3_matrix_library DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel @@ -482,8 +478,9 @@ if(EIGEN_BUILD_DOC) endif() -option(BUILD_TESTING "Enable creation of Eigen tests." ON) -if(BUILD_TESTING) +cmake_dependent_option(BUILD_TESTING "Enable creation of tests." ON "PROJECT_IS_TOP_LEVEL" OFF) +option(EIGEN_BUILD_TESTING "Enable creation of Eigen tests." 
${BUILD_TESTING}) +if(EIGEN_BUILD_TESTING) include(EigenConfigureTesting) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -495,6 +492,9 @@ if(BUILD_TESTING) add_subdirectory(failtest) endif() +include(CMakeDetermineFortranCompiler) +option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${CMAKE_Fortran_COMPILER}) +option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${CMAKE_Fortran_COMPILER}) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(blas) add_subdirectory(lapack) @@ -545,13 +545,30 @@ if(EIGEN_BUILD_BTL) add_subdirectory(bench/btl EXCLUDE_FROM_ALL) endif() -if(NOT WIN32) +find_package(CLANG_FORMAT 9 EXACT) +if(CLANG_FORMAT_FOUND) +set(FORMAT_SOURCES) +list(APPEND FORMAT_SUBDIRS blas bench demos "doc" Eigen include lapack scripts share unsupported test failtest) +foreach(DIR ${FORMAT_SUBDIRS}) + set(ABS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIR}) + file(GLOB_RECURSE ${DIR}_SOURCES ${ABS_DIR}/*.cc ${ABS_DIR}/*.h ${ABS_DIR}/*.cpp ${ABS_DIR}/*.hpp ${ABS_DIR}/*.c) + list(APPEND FORMAT_SOURCES ${${DIR}_SOURCES}) + endforeach() + file(GLOB FORMAT_SOURCES_WITHOUTENDING LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/Eigen/* ${CMAKE_CURRENT_SOURCE_DIR}/Eigen/CXX11/* ${CMAKE_CURRENT_SOURCE_DIR}/unsupported/Eigen/* ${CMAKE_CURRENT_SOURCE_DIR}/unsupported/Eigen/CXX11/*) + list(FILTER FORMAT_SOURCES_WITHOUTENDING EXCLUDE REGEX ".*.txt$") + list (APPEND FORMAT_SOURCES ${FORMAT_SOURCES_WITHOUTENDING}) + add_custom_target(format + COMMAND ${CLANG_FORMAT_EXECUTABLE} -i -style=file ${FORMAT_SOURCES} + DEPENDS ${FORMAT_SOURCES}) +endif() + +if(NOT WIN32 AND EIGEN_BUILD_SPBENCH) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) endif() configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) ei_testing_print_summary() endif() @@ -559,49 +576,49 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) -if(cmake_generator_tolower MATCHES "makefile") - message(STATUS "Available targets (use: make TARGET):") -else() - message(STATUS "Available targets (use: cmake --build . --target TARGET):") +if(PROJECT_IS_TOP_LEVEL) + string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) + if(cmake_generator_tolower MATCHES "makefile") + message(STATUS "Available targets (use: make TARGET):") + else() + message(STATUS "Available targets (use: cmake --build . --target TARGET):") + endif() + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "Target | Description") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "install | Install Eigen. Headers will be installed to:") + message(STATUS " | <$CMAKE_INSTALL_PREFIX>/<$INCLUDE_INSTALL_DIR>") + message(STATUS " | Using the following values:") + message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") + message(STATUS " | Change the install location of Eigen headers using:") + message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") + message(STATUS " | Or:") + message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") + message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") + if(EIGEN_BUILD_TESTING) + message(STATUS "check | Build and run the unit-tests. 
Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() + if(CLANG_FORMAT_FOUND) + message(STATUS "format | Formats the source code according to .clang-format file") + endif() + message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "uninstall| Remove files installed by the install target") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "") endif() -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "Target | Description") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "install | Install Eigen. Headers will be installed to:") -message(STATUS " | /") -message(STATUS " | Using the following values:") -message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") -message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") -message(STATUS " | Change the install location of Eigen headers using:") -message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") -message(STATUS " | Or:") -message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") -message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") -if(BUILD_TESTING) - message(STATUS "check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") -endif() -message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") -message(STATUS "uninstall| Remove files installed by the install target") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "") - set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} ) set ( EIGEN_VERSION_MAJOR ${EIGEN_WORLD_VERSION} ) set ( EIGEN_VERSION_MINOR ${EIGEN_MAJOR_VERSION} ) set ( EIGEN_VERSION_PATCH ${EIGEN_MINOR_VERSION} ) -set ( EIGEN_DEFINITIONS "") -set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" ) -set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} ) include (CMakePackageConfigHelpers) # Imported target support add_library (eigen INTERFACE) add_library (Eigen3::Eigen ALIAS eigen) -target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) target_include_directories (eigen INTERFACE $ $ @@ -612,23 +629,35 @@ set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) install (TARGETS eigen EXPORT Eigen3Targets) +option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ON) +if(EIGEN_BUILD_CMAKE_PACKAGE) configure_package_config_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake - PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} + NO_SET_AND_CHECK_MACRO # Eigen does not provide legacy style defines NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components ) -# Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does -# not depend on architecture specific settings or libraries. More -# specifically, an Eigen3Config.cmake generated from a 64 bit target can be -# used for 32 bit targets as well (and vice versa). 
-set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) -unset (CMAKE_SIZEOF_VOID_P) -write_basic_package_version_file (Eigen3ConfigVersion.cmake - VERSION ${EIGEN_VERSION_NUMBER} - COMPATIBILITY SameMajorVersion) -set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P}) + +# NOTE Remove the first code path once the minimum required CMake version is +# bumped to 3.14 or above. +if (CMAKE_VERSION VERSION_LESS 3.14) + # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does + # not depend on architecture specific settings or libraries. More + # specifically, an Eigen3Config.cmake generated from a 64 bit target can be + # used for 32 bit targets as well (and vice versa). + set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) + unset (CMAKE_SIZEOF_VOID_P) + write_basic_package_version_file (Eigen3ConfigVersion.cmake + VERSION ${EIGEN_VERSION_NUMBER} + COMPATIBILITY SameMajorVersion) + set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P}) +else (CMAKE_VERSION VERSION_LESS 3.14) + write_basic_package_version_file (Eigen3ConfigVersion.cmake + VERSION ${EIGEN_VERSION_NUMBER} + COMPATIBILITY SameMajorVersion + ARCH_INDEPENDENT) +endif (CMAKE_VERSION VERSION_LESS 3.14) # The Eigen target will be located in the Eigen3 namespace. Other CMake # targets can refer to it using Eigen3::Eigen. @@ -639,14 +668,16 @@ export (PACKAGE Eigen3) install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake - ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake - DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} ) +install (FILES ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake + DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}) # Add uninstall target -add_custom_target ( uninstall - COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) +if(NOT TARGET uninstall) + add_custom_target ( uninstall + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) +endif() +endif() if (EIGEN_SPLIT_TESTSUITE) ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}") diff --git a/libs/eigen/COPYING.GPL b/libs/eigen/COPYING.GPL deleted file mode 100644 index 94a9ed0..0000000 --- a/libs/eigen/COPYING.GPL +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. 
- - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. 
This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. 
This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. 
- - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. 
- - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. 
- - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. 
If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. 
Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/libs/eigen/COPYING.MPL2 b/libs/eigen/COPYING.MPL2 index 14e2f77..ee6256c 100644 --- a/libs/eigen/COPYING.MPL2 +++ b/libs/eigen/COPYING.MPL2 @@ -357,7 +357,7 @@ Exhibit A - Source Code Form License Notice This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. + file, You can obtain one at https://mozilla.org/MPL/2.0/. 
If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE diff --git a/libs/eigen/Eigen/AccelerateSupport b/libs/eigen/Eigen/AccelerateSupport new file mode 100644 index 0000000..8cee7ac --- /dev/null +++ b/libs/eigen/Eigen/AccelerateSupport @@ -0,0 +1,50 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ACCELERATESUPPORT_MODULE_H +#define EIGEN_ACCELERATESUPPORT_MODULE_H + +#include "SparseCore" + +#include "src/Core/util/DisableStupidWarnings.h" + +/** \ingroup Support_modules + * \defgroup AccelerateSupport_Module AccelerateSupport module + * + * This module provides an interface to the Apple Accelerate library. + * It provides the seven following main factorization classes: + * - class AccelerateLLT: a Cholesky (LL^T) factorization. + * - class AccelerateLDLT: the default LDL^T factorization. + * - class AccelerateLDLTUnpivoted: a Cholesky-like LDL^T factorization with only 1x1 pivots and no pivoting + * - class AccelerateLDLTSBK: an LDL^T factorization with Supernode Bunch-Kaufman and static pivoting + * - class AccelerateLDLTTPP: an LDL^T factorization with full threshold partial pivoting + * - class AccelerateQR: a QR factorization + * - class AccelerateCholeskyAtA: a QR factorization without storing Q (equivalent to A^TA = R^T R) + * + * \code + * #include <Eigen/AccelerateSupport> + * \endcode + * + * In order to use this module, the Accelerate headers must be accessible from + * the include paths, and your binary must be linked to the Accelerate framework. + * The Accelerate library is only available on Apple hardware. + * + * Note that many of the algorithms can be influenced by the UpLo template + * argument. All matrices are assumed to be symmetric. For example, the following + * creates an LDLT factorization where your matrix is symmetric (implicit) and + * uses the lower triangle: + * + * \code + * AccelerateLDLT<SparseMatrix<float>, Lower> ldlt; + * \endcode + */ + +#include "src/AccelerateSupport/AccelerateSupport.h" + +#include "src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_ACCELERATESUPPORT_MODULE_H diff --git a/libs/eigen/Eigen/Cholesky b/libs/eigen/Eigen/Cholesky index a318ceb..2c686f1 100644 --- a/libs/eigen/Eigen/Cholesky +++ b/libs/eigen/Eigen/Cholesky @@ -32,11 +32,7 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE -#ifdef EIGEN_USE_MKL -#include "mkl_lapacke.h" -#else -#include "src/misc/lapacke.h" -#endif +#include "src/misc/lapacke_helpers.h" #include "src/Cholesky/LLT_LAPACKE.h" #endif diff --git a/libs/eigen/Eigen/CholmodSupport b/libs/eigen/Eigen/CholmodSupport index bed8924..1037bd5 100644 --- a/libs/eigen/Eigen/CholmodSupport +++ b/libs/eigen/Eigen/CholmodSupport @@ -22,7 +22,7 @@ extern "C" { * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. - * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). 
+ * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). * * For the sake of completeness, this module also propose the two following classes: * - class CholmodSimplicialLLT diff --git a/libs/eigen/Eigen/Core b/libs/eigen/Eigen/Core index 5921e15..623d735 100644 --- a/libs/eigen/Eigen/Core +++ b/libs/eigen/Eigen/Core @@ -8,8 +8,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CORE_H -#define EIGEN_CORE_H +#ifndef EIGEN_CORE_MODULE_H +#define EIGEN_CORE_MODULE_H // first thing Eigen does: stop the compiler from reporting useless warnings. #include "src/Core/util/DisableStupidWarnings.h" @@ -36,7 +36,7 @@ // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. -#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5) +#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_MOST(5,5) #pragma GCC optimize ("-fno-ipa-cp-clone") #endif @@ -67,6 +67,7 @@ #endif #ifdef EIGEN_HAS_OPENMP +#include <atomic> #include <omp.h> #endif @@ -83,8 +84,8 @@ #include <cmath> #include <cassert> #include <functional> -#include <sstream> #ifndef EIGEN_NO_IO + #include <sstream> #include <iosfwd> #endif #include <cstring> @@ -94,14 +95,10 @@ // for min/max: #include <algorithm> -#if EIGEN_HAS_CXX11 #include <array> -#endif // for std::is_nothrow_move_assignable -#ifdef EIGEN_INCLUDE_TYPE_TRAITS #include <type_traits> -#endif // for outputting debug info #ifdef EIGEN_DEBUG_ASSIGN @@ -109,7 +106,8 @@ #endif // required for __cpuid, needs to be included after cmath -#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE +// also required for _BitScanReverse on Windows on ARM +#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE #include <intrin.h> #endif @@ -165,6 +163,7 @@ using std::ptrdiff_t; #include "src/Core/util/XprHelper.h" #include "src/Core/util/Memory.h" #include "src/Core/util/IntegralConstant.h" +#include "src/Core/util/Serializer.h" #include "src/Core/util/SymbolicIndex.h" #include "src/Core/NumTraits.h" @@ -179,6 +178,9 @@ #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 + #if defined EIGEN_VECTORIZE_AVX512FP16 + #include "src/Core/arch/AVX512/PacketMathFP16.h" + #endif #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" @@ -191,6 +193,7 @@ #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" + #include "src/Core/arch/AVX512/TrsmKernel.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" @@ -256,10 +259,14 @@ #include "src/Core/functors/StlFunctors.h" #include "src/Core/functors/AssignmentFunctors.h" -// Specialized functors to enable the processing of complex numbers -// on CUDA devices -#ifdef EIGEN_CUDACC -#include "src/Core/arch/CUDA/Complex.h" +// Specialized functors for GPU. +#ifdef EIGEN_GPUCC +#include "src/Core/arch/GPU/Complex.h" +#endif + +// Specializations of vectorized activation functions for NEON. 
+#ifdef EIGEN_VECTORIZE_NEON +#include "src/Core/arch/NEON/UnaryFunctors.h" #endif #include "src/Core/util/IndexedViewHelper.h" @@ -314,6 +321,7 @@ using std::ptrdiff_t; #include "src/Core/DiagonalMatrix.h" #include "src/Core/Diagonal.h" #include "src/Core/DiagonalProduct.h" +#include "src/Core/SkewSymmetricMatrix3.h" #include "src/Core/Redux.h" #include "src/Core/Visitor.h" #include "src/Core/Fuzzy.h" @@ -346,12 +354,16 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" -#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/MatrixProduct.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" #endif +#if defined(EIGEN_VECTORIZE_AVX512) + #include "src/Core/arch/AVX512/GemmKernel.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" @@ -381,4 +393,4 @@ using std::ptrdiff_t; #include "src/Core/util/ReenableStupidWarnings.h" -#endif // EIGEN_CORE_H +#endif // EIGEN_CORE_MODULE_H diff --git a/libs/eigen/Eigen/IterativeLinearSolvers b/libs/eigen/Eigen/IterativeLinearSolvers index 957d575..26a0560 100644 --- a/libs/eigen/Eigen/IterativeLinearSolvers +++ b/libs/eigen/Eigen/IterativeLinearSolvers @@ -27,7 +27,7 @@ * - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices. * - IncompleteLUT - incomplete LU factorization with dual thresholding * - * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport. + * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport, AccelerateSupport. 
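For orientation, here is a minimal sketch of the solver API the IterativeLinearSolvers doc above refers to, using ConjugateGradient on a hand-built SPD tridiagonal system. The matrix values and sizes are illustrative only, not taken from the diff:

    #include <Eigen/Sparse>
    #include <iostream>

    int main() {
      const int n = 100;
      Eigen::SparseMatrix<double> A(n, n);
      for (int i = 0; i < n; ++i) {
        A.insert(i, i) = 2.0;  // diagonal of a discrete Laplacian
        if (i + 1 < n) { A.insert(i, i + 1) = -1.0; A.insert(i + 1, i) = -1.0; }
      }
      A.makeCompressed();
      Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

      // Same compute()/solve() interface as the direct modules listed above.
      Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower | Eigen::Upper> cg(A);
      Eigen::VectorXd x = cg.solve(b);
      std::cout << "iterations: " << cg.iterations() << ", error: " << cg.error() << "\n";
    }

Swapping in one of the direct decompositions (SimplicialLLT, CholmodSupernodalLLT, AccelerateLLT, ...) only changes the solver type; the compute/solve calls stay the same.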
* \code #include <Eigen/IterativeLinearSolvers> diff --git a/libs/eigen/Eigen/LU b/libs/eigen/Eigen/LU index 1236ceb..b7f9a8a 100644 --- a/libs/eigen/Eigen/LU +++ b/libs/eigen/Eigen/LU @@ -28,11 +28,7 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE -#ifdef EIGEN_USE_MKL -#include "mkl_lapacke.h" -#else -#include "src/misc/lapacke.h" -#endif +#include "src/misc/lapacke_helpers.h" #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" diff --git a/libs/eigen/Eigen/QR b/libs/eigen/Eigen/QR index 8465b62..1f6c22e 100644 --- a/libs/eigen/Eigen/QR +++ b/libs/eigen/Eigen/QR @@ -36,11 +36,7 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE -#ifdef EIGEN_USE_MKL -#include "mkl_lapacke.h" -#else -#include "src/misc/lapacke.h" -#endif +#include "src/misc/lapacke_helpers.h" #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/libs/eigen/Eigen/SPQRSupport b/libs/eigen/Eigen/SPQRSupport index f70390c..33c3370 100644 --- a/libs/eigen/Eigen/SPQRSupport +++ b/libs/eigen/Eigen/SPQRSupport @@ -28,7 +28,7 @@ * */ -#include "src/CholmodSupport/CholmodSupport.h" +#include "Eigen/CholmodSupport" #include "src/SPQRSupport/SuiteSparseQRSupport.h" #endif diff --git a/libs/eigen/Eigen/SVD b/libs/eigen/Eigen/SVD index 3451794..8241c73 100644 --- a/libs/eigen/Eigen/SVD +++ b/libs/eigen/Eigen/SVD @@ -36,14 +36,17 @@ #include "src/SVD/SVDBase.h" #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" -#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_LAPACKE #ifdef EIGEN_USE_MKL #include "mkl_lapacke.h" #else #include "src/misc/lapacke.h" #endif +#ifndef EIGEN_USE_LAPACKE_STRICT #include "src/SVD/JacobiSVD_LAPACKE.h" #endif +#include "src/SVD/BDCSVD_LAPACKE.h" +#endif #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/libs/eigen/Eigen/SparseCore b/libs/eigen/Eigen/SparseCore index 76966c4..b2db46b 100644 --- a/libs/eigen/Eigen/SparseCore +++ b/libs/eigen/Eigen/SparseCore @@ -41,7 +41,6 @@ #include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" #include "src/SparseCore/SparseMap.h" -#include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" #include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" diff --git a/libs/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h b/libs/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h new file mode 100644 index 0000000..0417688 --- /dev/null +++ b/libs/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h @@ -0,0 +1,421 @@ +#ifndef EIGEN_ACCELERATESUPPORT_H +#define EIGEN_ACCELERATESUPPORT_H + +#include <Accelerate/Accelerate.h> + +#include <Eigen/SparseCore> + +namespace Eigen { + +template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_> +class AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateLLT + * \brief A direct Cholesky (LLT) factorization and solver based on Accelerate + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ * + * \sa \ref TutorialSparseSolverConcept, class AccelerateLLT + */ +template +using AccelerateLLT = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateLDLT + * \brief The default Cholesky (LDLT) factorization and solver based on Accelerate + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ additional information about the matrix structure. Default is Lower. + * + * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLT + */ +template +using AccelerateLDLT = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateLDLTUnpivoted + * \brief A direct Cholesky-like LDL^T factorization and solver based on Accelerate with only 1x1 pivots and no pivoting + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ additional information about the matrix structure. Default is Lower. + * + * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTUnpivoted + */ +template +using AccelerateLDLTUnpivoted = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateLDLTSBK + * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with Supernode Bunch-Kaufman and static pivoting + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ additional information about the matrix structure. Default is Lower. + * + * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTSBK + */ +template +using AccelerateLDLTSBK = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateLDLTTPP + * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with full threshold partial pivoting + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ additional information about the matrix structure. Default is Lower. 
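The aliases above only differ in the factorization kind they request from Accelerate. As a rough usage sketch, assuming the documented defaults (UpLo_ = Lower, matrix treated as symmetric) and a double-precision matrix chosen for illustration:

    #include <Eigen/AccelerateSupport>
    #include <Eigen/SparseCore>

    using SpMat = Eigen::SparseMatrix<double>;

    // Supernode Bunch-Kaufman with static pivoting: a robustness/speed middle ground.
    Eigen::VectorXd solveWithSBK(const SpMat& A, const Eigen::VectorXd& b) {
      Eigen::AccelerateLDLTSBK<SpMat> ldlt(A);  // lower triangle read, matrix assumed symmetric
      return ldlt.solve(b);
    }

Picking AccelerateLDLTTPP instead trades speed for full threshold partial pivoting; the call sites are unchanged.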
+ * + * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTTPP + */ +template +using AccelerateLDLTTPP = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateQR + * \brief A QR factorization and solver based on Accelerate + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * + * \sa \ref TutorialSparseSolverConcept, class AccelerateQR + */ +template +using AccelerateQR = AccelerateImpl; + +/** \ingroup AccelerateSupport_Module + * \class AccelerateCholeskyAtA + * \brief A QR factorization and solver based on Accelerate without storing Q (equivalent to A^TA = R^T R) + * + * \warning Only single and double precision real scalar types are supported by Accelerate + * + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * + * \sa \ref TutorialSparseSolverConcept, class AccelerateCholeskyAtA + */ +template +using AccelerateCholeskyAtA = AccelerateImpl; + +namespace internal { +template +struct AccelFactorizationDeleter { + void operator()(T* sym) { + if (sym) { + SparseCleanup(*sym); + delete sym; + sym = nullptr; + } + } +}; + +template +struct SparseTypesTraitBase { + typedef DenseVecT AccelDenseVector; + typedef DenseMatT AccelDenseMatrix; + typedef SparseMatT AccelSparseMatrix; + + typedef SparseOpaqueSymbolicFactorization SymbolicFactorization; + typedef NumFactT NumericFactorization; + + typedef AccelFactorizationDeleter SymbolicFactorizationDeleter; + typedef AccelFactorizationDeleter NumericFactorizationDeleter; +}; + +template +struct SparseTypesTrait {}; + +template <> +struct SparseTypesTrait : SparseTypesTraitBase {}; + +template <> +struct SparseTypesTrait + : SparseTypesTraitBase { +}; + +} // end namespace internal + +template +class AccelerateImpl : public SparseSolverBase > { + protected: + using Base = SparseSolverBase; + using Base::derived; + using Base::m_isInitialized; + + public: + using Base::_solve_impl; + + typedef MatrixType_ MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::StorageIndex StorageIndex; + enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic }; + enum { UpLo = UpLo_ }; + + using AccelDenseVector = typename internal::SparseTypesTrait::AccelDenseVector; + using AccelDenseMatrix = typename internal::SparseTypesTrait::AccelDenseMatrix; + using AccelSparseMatrix = typename internal::SparseTypesTrait::AccelSparseMatrix; + using SymbolicFactorization = typename internal::SparseTypesTrait::SymbolicFactorization; + using NumericFactorization = typename internal::SparseTypesTrait::NumericFactorization; + using SymbolicFactorizationDeleter = typename internal::SparseTypesTrait::SymbolicFactorizationDeleter; + using NumericFactorizationDeleter = typename internal::SparseTypesTrait::NumericFactorizationDeleter; + + AccelerateImpl() { + m_isInitialized = false; + + auto check_flag_set = [](int value, int flag) { return ((value & flag) == flag); }; + + if (check_flag_set(UpLo_, Symmetric)) { + m_sparseKind = SparseSymmetric; + m_triType = (UpLo_ & Lower) ? 
SparseLowerTriangle : SparseUpperTriangle; + } else if (check_flag_set(UpLo_, UnitLower)) { + m_sparseKind = SparseUnitTriangular; + m_triType = SparseLowerTriangle; + } else if (check_flag_set(UpLo_, UnitUpper)) { + m_sparseKind = SparseUnitTriangular; + m_triType = SparseUpperTriangle; + } else if (check_flag_set(UpLo_, StrictlyLower)) { + m_sparseKind = SparseTriangular; + m_triType = SparseLowerTriangle; + } else if (check_flag_set(UpLo_, StrictlyUpper)) { + m_sparseKind = SparseTriangular; + m_triType = SparseUpperTriangle; + } else if (check_flag_set(UpLo_, Lower)) { + m_sparseKind = SparseTriangular; + m_triType = SparseLowerTriangle; + } else if (check_flag_set(UpLo_, Upper)) { + m_sparseKind = SparseTriangular; + m_triType = SparseUpperTriangle; + } else { + m_sparseKind = SparseOrdinary; + m_triType = (UpLo_ & Lower) ? SparseLowerTriangle : SparseUpperTriangle; + } + + m_order = SparseOrderDefault; + } + + explicit AccelerateImpl(const MatrixType& matrix) : AccelerateImpl() { compute(matrix); } + + ~AccelerateImpl() {} + + inline Index cols() const { return m_nCols; } + inline Index rows() const { return m_nRows; } + + ComputationInfo info() const { + eigen_assert(m_isInitialized && "Decomposition is not initialized."); + return m_info; + } + + void analyzePattern(const MatrixType& matrix); + + void factorize(const MatrixType& matrix); + + void compute(const MatrixType& matrix); + + template + void _solve_impl(const MatrixBase& b, MatrixBase& dest) const; + + /** Sets the ordering algorithm to use. */ + void setOrder(SparseOrder_t order) { m_order = order; } + + private: + template + void buildAccelSparseMatrix(const SparseMatrix& a, AccelSparseMatrix& A, std::vector& columnStarts) { + const Index nColumnsStarts = a.cols() + 1; + + columnStarts.resize(nColumnsStarts); + + for (Index i = 0; i < nColumnsStarts; i++) columnStarts[i] = a.outerIndexPtr()[i]; + + SparseAttributes_t attributes{}; + attributes.transpose = false; + attributes.triangle = m_triType; + attributes.kind = m_sparseKind; + + SparseMatrixStructure structure{}; + structure.attributes = attributes; + structure.rowCount = static_cast(a.rows()); + structure.columnCount = static_cast(a.cols()); + structure.blockSize = 1; + structure.columnStarts = columnStarts.data(); + structure.rowIndices = const_cast(a.innerIndexPtr()); + + A.structure = structure; + A.data = const_cast(a.valuePtr()); + } + + void doAnalysis(AccelSparseMatrix& A) { + m_numericFactorization.reset(nullptr); + + SparseSymbolicFactorOptions opts{}; + opts.control = SparseDefaultControl; + opts.orderMethod = m_order; + opts.order = nullptr; + opts.ignoreRowsAndColumns = nullptr; + opts.malloc = malloc; + opts.free = free; + opts.reportError = nullptr; + + m_symbolicFactorization.reset(new SymbolicFactorization(SparseFactor(Solver_, A.structure, opts))); + + SparseStatus_t status = m_symbolicFactorization->status; + + updateInfoStatus(status); + + if (status != SparseStatusOK) m_symbolicFactorization.reset(nullptr); + } + + void doFactorization(AccelSparseMatrix& A) { + SparseStatus_t status = SparseStatusReleased; + + if (m_symbolicFactorization) { + m_numericFactorization.reset(new NumericFactorization(SparseFactor(*m_symbolicFactorization, A))); + + status = m_numericFactorization->status; + + if (status != SparseStatusOK) m_numericFactorization.reset(nullptr); + } + + updateInfoStatus(status); + } + + protected: + void updateInfoStatus(SparseStatus_t status) const { + switch (status) { + case SparseStatusOK: + m_info = Success; + break; + case 
SparseFactorizationFailed: + case SparseMatrixIsSingular: + m_info = NumericalIssue; + break; + case SparseInternalError: + case SparseParameterError: + case SparseStatusReleased: + default: + m_info = InvalidInput; + break; + } + } + + mutable ComputationInfo m_info; + Index m_nRows, m_nCols; + std::unique_ptr m_symbolicFactorization; + std::unique_ptr m_numericFactorization; + SparseKind_t m_sparseKind; + SparseTriangle_t m_triType; + SparseOrder_t m_order; +}; + +/** Computes the symbolic and numeric decomposition of matrix \a a */ +template +void AccelerateImpl::compute(const MatrixType& a) { + if (EnforceSquare_) eigen_assert(a.rows() == a.cols()); + + m_nRows = a.rows(); + m_nCols = a.cols(); + + AccelSparseMatrix A{}; + std::vector columnStarts; + + buildAccelSparseMatrix(a, A, columnStarts); + + doAnalysis(A); + + if (m_symbolicFactorization) doFactorization(A); + + m_isInitialized = true; +} + +/** Performs a symbolic decomposition on the sparsity pattern of matrix \a a. + * + * This function is particularly useful when solving for several problems having the same structure. + * + * \sa factorize() + */ +template +void AccelerateImpl::analyzePattern(const MatrixType& a) { + if (EnforceSquare_) eigen_assert(a.rows() == a.cols()); + + m_nRows = a.rows(); + m_nCols = a.cols(); + + AccelSparseMatrix A{}; + std::vector columnStarts; + + buildAccelSparseMatrix(a, A, columnStarts); + + doAnalysis(A); + + m_isInitialized = true; +} + +/** Performs a numeric decomposition of matrix \a a. + * + * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. + * + * \sa analyzePattern() + */ +template +void AccelerateImpl::factorize(const MatrixType& a) { + eigen_assert(m_symbolicFactorization && "You must first call analyzePattern()"); + eigen_assert(m_nRows == a.rows() && m_nCols == a.cols()); + + if (EnforceSquare_) eigen_assert(a.rows() == a.cols()); + + AccelSparseMatrix A{}; + std::vector columnStarts; + + buildAccelSparseMatrix(a, A, columnStarts); + + doFactorization(A); +} + +template +template +void AccelerateImpl::_solve_impl(const MatrixBase& b, + MatrixBase& x) const { + if (!m_numericFactorization) { + m_info = InvalidInput; + return; + } + + eigen_assert(m_nRows == b.rows()); + eigen_assert(((b.cols() == 1) || b.outerStride() == b.rows())); + + SparseStatus_t status = SparseStatusOK; + + Scalar* b_ptr = const_cast(b.derived().data()); + Scalar* x_ptr = const_cast(x.derived().data()); + + AccelDenseMatrix xmat{}; + xmat.attributes = SparseAttributes_t(); + xmat.columnCount = static_cast(x.cols()); + xmat.rowCount = static_cast(x.rows()); + xmat.columnStride = xmat.rowCount; + xmat.data = x_ptr; + + AccelDenseMatrix bmat{}; + bmat.attributes = SparseAttributes_t(); + bmat.columnCount = static_cast(b.cols()); + bmat.rowCount = static_cast(b.rows()); + bmat.columnStride = bmat.rowCount; + bmat.data = b_ptr; + + SparseSolve(*m_numericFactorization, bmat, xmat); + + updateInfoStatus(status); +} + +} // end namespace Eigen + +#endif // EIGEN_ACCELERATESUPPORT_H diff --git a/libs/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..69bcff5 --- /dev/null +++ b/libs/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_ACCELERATESUPPORT_MODULE_H +#error "Please include Eigen/AccelerateSupport instead of including headers inside the src directory directly." 
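The analyzePattern()/factorize() split documented in this file pays off when several matrices share one sparsity pattern, since the symbolic step is done once. A sketch, with the solver alias and scalar type chosen for illustration:

    #include <Eigen/AccelerateSupport>
    #include <Eigen/SparseCore>

    using SpMat = Eigen::SparseMatrix<double>;

    void solveSequence(const SpMat& A0, const SpMat& A1, const Eigen::VectorXd& b) {
      Eigen::AccelerateLLT<SpMat> llt;
      llt.analyzePattern(A0);             // symbolic factorization, done once
      llt.factorize(A0);                  // numeric factorization of the first matrix
      Eigen::VectorXd x0 = llt.solve(b);
      llt.factorize(A1);                  // same pattern, new values: reuses the symbolic step
      Eigen::VectorXd x1 = llt.solve(b);
    }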
+#endif diff --git a/libs/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h new file mode 100644 index 0000000..5de2b21 --- /dev/null +++ b/libs/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CHOLESKY_MODULE_H +#error "Please include Eigen/Cholesky instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/Cholesky/LDLT.h b/libs/eigen/Eigen/src/Cholesky/LDLT.h index 1013ca0..1d0369b 100644 --- a/libs/eigen/Eigen/src/Cholesky/LDLT.h +++ b/libs/eigen/Eigen/src/Cholesky/LDLT.h @@ -13,11 +13,13 @@ #ifndef EIGEN_LDLT_H #define EIGEN_LDLT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { - template struct traits > - : traits<_MatrixType> + template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -37,8 +39,8 @@ namespace internal { * * \brief Robust Cholesky decomposition of a matrix with pivoting * - * \tparam _MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition - * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. + * \tparam MatrixType_ the type of the matrix of which to compute the LDL^T Cholesky decomposition + * \tparam UpLo_ the triangular part that will be used for the decomposition: Lower (default) or Upper. * The other triangular part won't be read. * * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite @@ -56,11 +58,11 @@ namespace internal { * * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ -template class LDLT - : public SolverBase > +template class LDLT + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -68,7 +70,7 @@ template class LDLT enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - UpLo = _UpLo + UpLo = UpLo_ }; typedef Matrix TmpMatrixType; @@ -244,7 +246,7 @@ template class LDLT * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LDLT& adjoint() const { return *this; }; + const LDLT& adjoint() const { return *this; } EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } @@ -270,10 +272,7 @@ template class LDLT protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) /** \internal * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U. 
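For reference, a minimal dense-world sketch of the LDLT interface touched by this hunk, including the adjoint() pass-through it documents; sizes and values are illustrative:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd M = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd A = M * M.transpose();        // symmetric positive semidefinite
      Eigen::VectorXd b = Eigen::VectorXd::Ones(4);

      Eigen::LDLT<Eigen::MatrixXd> ldlt(A);
      Eigen::VectorXd x = ldlt.solve(b);
      Eigen::VectorXd y = ldlt.adjoint().solve(b);  // adjoint() returns *this, enabling generic code
      return ldlt.info() == Eigen::Success ? 0 : 1;
    }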
@@ -441,7 +440,7 @@ template<> struct ldlt_inplace // Update the terms of L Index rs = size-j-1; w.tail(rs) -= wj * mat.col(j).tail(rs); - if(gamma != 0) + if(!numext::is_exactly_zero(gamma)) mat.col(j).tail(rs) += (sigma*numext::conj(wj)/gamma)*w.tail(rs); } return true; @@ -494,12 +493,10 @@ template struct LDLT_Traits /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix */ -template +template template -LDLT& LDLT::compute(const EigenBase& a) +LDLT& LDLT::compute(const EigenBase& a) { - check_template_parameters(); - eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); @@ -510,7 +507,7 @@ LDLT& LDLT::compute(const EigenBase() + m_matrix.row(col).head(col).template lpNorm<1>(); else abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); @@ -534,9 +531,9 @@ LDLT& LDLT::compute(const EigenBase +template template -LDLT& LDLT::rankUpdate(const MatrixBase& w, const typename LDLT::RealScalar& sigma) +LDLT& LDLT::rankUpdate(const MatrixBase& w, const typename LDLT::RealScalar& sigma) { typedef typename TranspositionType::StorageIndex IndexType; const Index size = w.rows(); @@ -562,16 +559,16 @@ LDLT& LDLT::rankUpdate(const MatrixBase +template template -void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const +void LDLT::_solve_impl(const RhsType &rhs, DstType &dst) const { _solve_impl_transposed(rhs, dst); } -template +template template -void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void LDLT::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { // dst = P b dst = m_transpositions * rhs; @@ -624,9 +621,9 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType * * \sa LDLT::solve(), MatrixBase::ldlt() */ -template +template template -bool LDLT::solveInPlace(MatrixBase &bAndX) const +bool LDLT::solveInPlace(MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LDLT is not initialized."); eigen_assert(m_matrix.rows() == bAndX.rows()); @@ -639,8 +636,8 @@ bool LDLT::solveInPlace(MatrixBase &bAndX) const /** \returns the matrix represented by the decomposition, * i.e., it returns the product: P^T L D L^* P. * This function is provided for debug purpose. */ -template -MatrixType LDLT::reconstructedMatrix() const +template +MatrixType LDLT::reconstructedMatrix() const { eigen_assert(m_isInitialized && "LDLT is not initialized."); const Index size = m_matrix.rows(); diff --git a/libs/eigen/Eigen/src/Cholesky/LLT.h b/libs/eigen/Eigen/src/Cholesky/LLT.h index 8c9b2b3..1443eac 100644 --- a/libs/eigen/Eigen/src/Cholesky/LLT.h +++ b/libs/eigen/Eigen/src/Cholesky/LLT.h @@ -10,12 +10,14 @@ #ifndef EIGEN_LLT_H #define EIGEN_LLT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal{ -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -32,8 +34,8 @@ template struct LLT_Traits; * * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features * - * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition - * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. 
+ * \tparam MatrixType_ the type of the matrix of which we are computing the LL^T Cholesky decomposition + * \tparam UpLo_ the triangular part that will be used for the decomposition: Lower (default) or Upper. * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite @@ -58,16 +60,16 @@ template struct LLT_Traits; * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * - * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Note that during the decomposition, only the lower (or upper, as defined by UpLo_) triangular part of A is considered. * Therefore, the strict lower part does not have to store correct values. * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ -template class LLT - : public SolverBase > +template class LLT + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -79,7 +81,7 @@ template class LLT enum { PacketSize = internal::packet_traits::size, AlignmentMask = int(PacketSize)-1, - UpLo = _UpLo + UpLo = UpLo_ }; typedef internal::LLT_Traits Traits; @@ -199,7 +201,7 @@ template class LLT * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }; + const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; } inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } @@ -217,10 +219,7 @@ template class LLT protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) /** \internal * Used to compute and store L @@ -243,7 +242,7 @@ static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::ColXpr ColXpr; - typedef typename internal::remove_all::type ColXprCleaned; + typedef internal::remove_all_t ColXprCleaned; typedef typename ColXprCleaned::SegmentReturnType ColXprSegment; typedef Matrix TempVectorType; typedef typename TempVectorType::SegmentReturnType TempVecSegment; @@ -298,7 +297,7 @@ static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const if(rs) { temp.tail(rs) -= (wj/Ljj) * mat.col(j).tail(rs); - if(gamma != 0) + if(!numext::is_exactly_zero(gamma)) mat.col(j).tail(rs) = (nLjj/Ljj) * mat.col(j).tail(rs) + (nLjj * sigma*numext::conj(wj)/gamma)*temp.tail(rs); } } @@ -427,12 +426,10 @@ template struct LLT_Traits * Example: \include TutorialLinAlgComputeTwice.cpp * Output: \verbinclude TutorialLinAlgComputeTwice.out */ -template +template template -LLT& LLT::compute(const EigenBase& a) +LLT& LLT::compute(const EigenBase& a) { - check_template_parameters(); - eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); @@ -444,7 +441,7 @@ LLT& LLT::compute(const EigenBase // TODO move this code to SelfAdjointView for (Index col = 0; col < size; ++col) { RealScalar abs_col_sum; - if (_UpLo == Lower) + if (UpLo_ == Lower) abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + 
m_matrix.row(col).head(col).template lpNorm<1>(); else abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); @@ -464,9 +461,9 @@ LLT& LLT::compute(const EigenBase * then after it we have LL^* = A + sigma * v v^* where \a v must be a vector * of same dimension. */ -template +template template -LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT & LLT::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -480,16 +477,16 @@ LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, } #ifndef EIGEN_PARSED_BY_DOXYGEN -template +template template -void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const +void LLT::_solve_impl(const RhsType &rhs, DstType &dst) const { _solve_impl_transposed(rhs, dst); } -template +template template -void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void LLT::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { dst = rhs; @@ -511,9 +508,9 @@ void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType * * \sa LLT::solve(), MatrixBase::llt() */ -template +template template -void LLT::solveInPlace(const MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); @@ -524,8 +521,8 @@ void LLT::solveInPlace(const MatrixBase &bAndX) const /** \returns the matrix represented by the decomposition, * i.e., it returns the product: L L^*. * This function is provided for debug purpose. */ -template -MatrixType LLT::reconstructedMatrix() const +template +MatrixType LLT::reconstructedMatrix() const { eigen_assert(m_isInitialized && "LLT is not initialized."); return matrixL() * matrixL().adjoint().toDenseMatrix(); diff --git a/libs/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h b/libs/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h index bc6489e..62bc679 100644 --- a/libs/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +++ b/libs/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h @@ -33,64 +33,86 @@ #ifndef EIGEN_LLT_LAPACKE_H #define EIGEN_LLT_LAPACKE_H -namespace Eigen { +#include "./InternalHeaderCheck.h" + +namespace Eigen { namespace internal { -template struct lapacke_llt; +namespace lapacke_helpers { + // ------------------------------------------------------------------------------------------------------------------- + // Dispatch for rank update handling upper and lower parts + // ------------------------------------------------------------------------------------------------------------------- -#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \ -template<> struct lapacke_llt \ -{ \ - template \ - static inline Index potrf(MatrixType& m, char uplo) \ - { \ - lapack_int matrix_order; \ - lapack_int size, lda, info, StorageOrder; \ - EIGTYPE* a; \ - eigen_assert(m.rows()==m.cols()); \ - /* Set up parameters for ?potrf */ \ - size = convert_index(m.rows()); \ - StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \ - matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \ - a = &(m.coeffRef(0,0)); \ - lda = convert_index(m.outerStride()); \ -\ - info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \ - info = (info==0) ? -1 : info>0 ? 
info-1 : size; \ - return info; \ - } \ -}; \ -template<> struct llt_inplace<EIGTYPE, Lower> \ -{ \ - template<typename MatrixType> \ - static Index blocked(MatrixType& m) \ - { \ - return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \ - } \ - template<typename MatrixType, typename VectorType> \ - static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \ - { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \ -}; \ -template<> struct llt_inplace<EIGTYPE, Upper> \ -{ \ - template<typename MatrixType> \ - static Index blocked(MatrixType& m) \ - { \ - return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \ - } \ - template<typename MatrixType, typename VectorType> \ - static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \ - { \ - Transpose<MatrixType> matt(mat); \ - return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \ - } \ -}; + template<UpLoType Mode> + struct rank_update {}; -EIGEN_LAPACKE_LLT(double, double, d) -EIGEN_LAPACKE_LLT(float, float, s) -EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z) -EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c) + template<> + struct rank_update<Lower> { + template<typename MatrixType, typename VectorType> + static Index run(MatrixType &mat, const VectorType &vec, const typename MatrixType::RealScalar &sigma) { + return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); + } + }; + + template<> + struct rank_update<Upper> { + template<typename MatrixType, typename VectorType> + static Index run(MatrixType &mat, const VectorType &vec, const typename MatrixType::RealScalar &sigma) { + Transpose<MatrixType> matt(mat); + return Eigen::internal::llt_rank_update_lower(matt, vec.conjugate(), sigma); + } + }; + + // ------------------------------------------------------------------------------------------------------------------- + // Generic lapacke llt implementation that hands off to the dispatches above + // ------------------------------------------------------------------------------------------------------------------- + + template<typename Scalar, UpLoType Mode> + struct lapacke_llt { + template<typename MatrixType> + static Index blocked(MatrixType& m) + { + eigen_assert(m.rows() == m.cols()); + if(m.rows() == 0) { + return -1; + } + /* Set up parameters for ?potrf */ + lapack_int size = to_lapack(m.rows()); + lapack_int matrix_order = lapack_storage_of(m); + Scalar* a = &(m.coeffRef(0,0)); + lapack_int lda = to_lapack(m.outerStride()); + + lapack_int info = potrf(matrix_order, translate_mode<Mode>, size, to_lapack(a), lda ); + info = (info==0) ? -1 : info>0 ? info-1 : size; + return info; + } + + template<typename MatrixType, typename VectorType> + static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) + { + return rank_update<Mode>::run(mat, vec, sigma); + } + }; +} +// end namespace lapacke_helpers + +/* + * Here, we just put the generic implementation from lapacke_llt into a full specialization of the llt_inplace + * type. By being a full specialization, the versions defined here thus get precedence over the generic implementation + * in LLT.h for double, float and complex double, complex float types.
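From the user's side nothing changes: defining EIGEN_USE_LAPACKE before including Eigen still routes the blocked Cholesky through LAPACK's ?potrf, now via the lapacke_helpers wrappers. A sketch; the exact build flags are an assumption (e.g. linking against -llapacke):

    // g++ -DEIGEN_USE_LAPACKE main.cpp -llapacke
    #define EIGEN_USE_LAPACKE
    #include <Eigen/Dense>

    int main() {
      const int n = 64;
      Eigen::MatrixXd M = Eigen::MatrixXd::Random(n, n);
      Eigen::MatrixXd A = M * M.transpose() + n * Eigen::MatrixXd::Identity(n, n); // SPD
      Eigen::LLT<Eigen::MatrixXd> llt(A);  // blocked path dispatches to LAPACKE dpotrf
      return llt.info() == Eigen::Success ? 0 : 1;
    }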
+ */ + +#define EIGEN_LAPACKE_LLT(EIGTYPE) \ +template<> struct llt_inplace : public lapacke_helpers::lapacke_llt {}; \ +template<> struct llt_inplace : public lapacke_helpers::lapacke_llt {}; + +EIGEN_LAPACKE_LLT(double) +EIGEN_LAPACKE_LLT(float) +EIGEN_LAPACKE_LLT(std::complex) +EIGEN_LAPACKE_LLT(std::complex) + +#undef EIGEN_LAPACKE_LLT } // end namespace internal diff --git a/libs/eigen/Eigen/src/CholmodSupport/CholmodSupport.h b/libs/eigen/Eigen/src/CholmodSupport/CholmodSupport.h index adaf528..91c1cfc 100644 --- a/libs/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/libs/eigen/Eigen/src/CholmodSupport/CholmodSupport.h @@ -10,6 +10,8 @@ #ifndef EIGEN_CHOLMODSUPPORT_H #define EIGEN_CHOLMODSUPPORT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -54,8 +56,8 @@ template<> struct cholmod_configure_matrix > { /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object. * Note that the data are shared. */ -template -cholmod_sparse viewAsCholmod(Ref > mat) +template +cholmod_sparse viewAsCholmod(Ref > mat) { cholmod_sparse res; res.nzmax = mat.nonZeros(); @@ -80,11 +82,11 @@ cholmod_sparse viewAsCholmod(Ref > res.dtype = 0; res.stype = -1; - if (internal::is_same<_StorageIndex,int>::value) + if (internal::is_same::value) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) + else if (internal::is_same::value) { res.itype = CHOLMOD_LONG; } @@ -94,39 +96,39 @@ cholmod_sparse viewAsCholmod(Ref > } // setup res.xtype - internal::cholmod_configure_matrix<_Scalar>::run(res); + internal::cholmod_configure_matrix::run(res); res.stype = 0; return res; } -template -const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat) +template +const cholmod_sparse viewAsCholmod(const SparseMatrix& mat) { - cholmod_sparse res = viewAsCholmod(Ref >(mat.const_cast_derived())); + cholmod_sparse res = viewAsCholmod(Ref >(mat.const_cast_derived())); return res; } -template -const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat) +template +const cholmod_sparse viewAsCholmod(const SparseVector& mat) { - cholmod_sparse res = viewAsCholmod(Ref >(mat.const_cast_derived())); + cholmod_sparse res = viewAsCholmod(Ref >(mat.const_cast_derived())); return res; } /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix. * The data are not copied but shared. */ -template -cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) +template +cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) { - cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); + cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); if(UpLo==Upper) res.stype = 1; if(UpLo==Lower) res.stype = -1; // swap stype for rowmajor matrices (only works for real matrices) - EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); - if(_Options & RowMajorBit) res.stype *=-1; + EIGEN_STATIC_ASSERT((Options_ & RowMajorBit) == 0 || NumTraits::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + if(Options_ & RowMajorBit) res.stype *=-1; return res; } @@ -155,9 +157,9 @@ cholmod_dense viewAsCholmod(MatrixBase& mat) /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix. * The data are not copied but shared. 
*/ template -MappedSparseMatrix viewAsEigen(cholmod_sparse& cm) +Map > viewAsEigen(cholmod_sparse& cm) { - return MappedSparseMatrix + return Map > (cm.nrow, cm.ncol, static_cast(cm.p)[cm.ncol], static_cast(cm.p), static_cast(cm.i),static_cast(cm.x) ); } @@ -167,11 +169,11 @@ namespace internal { // template specializations for int and long that call the correct cholmod method #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ - template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ + template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ template<> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ - template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ + template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ template<> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } EIGEN_CHOLMOD_SPECIALIZE0(int, start) @@ -183,14 +185,14 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) -template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } +template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } -template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } +template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } -template -inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } +template +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], StorageIndex_* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } template<> inline int cm_factorize_p (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } @@ -210,7 +212,7 @@ enum CholmodMode { * \brief The base class for the direct Cholesky factorization of Cholmod * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT */ -template +template class CholmodBase : public SparseSolverBase { protected: @@ -218,8 +220,8 @@ class CholmodBase : public SparseSolverBase using Base::derived; using Base::m_isInitialized; public: - typedef _MatrixType MatrixType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + enum { UpLo = UpLo_ }; typedef typename MatrixType::Scalar Scalar; typedef 
typename MatrixType::RealScalar RealScalar; typedef MatrixType CholMatrixType; @@ -436,7 +438,7 @@ class CholmodBase : public SparseSolverBase if (m_cholmodFactor->is_ll) logDet *= 2.0; return logDet; - }; + } template void dumpMemory(Stream& /*s*/) @@ -461,8 +463,8 @@ class CholmodBase : public SparseSolverBase * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices * X and B can be either dense or sparse. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * * \implsparsesolverconcept @@ -473,15 +475,15 @@ class CholmodBase : public SparseSolverBase * * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT */ -template -class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> > +template +class CholmodSimplicialLLT : public CholmodBase > { - typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; + typedef CholmodBase Base; using Base::m_cholmod; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; CholmodSimplicialLLT() : Base() { init(); } @@ -512,8 +514,8 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices * X and B can be either dense or sparse. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * * \implsparsesolverconcept @@ -524,15 +526,15 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl * * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT */ -template -class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> > +template +class CholmodSimplicialLDLT : public CholmodBase > { - typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; + typedef CholmodBase Base; using Base::m_cholmod; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; CholmodSimplicialLDLT() : Base() { init(); } @@ -561,8 +563,8 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices * X and B can be either dense or sparse. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. 
* * \implsparsesolverconcept @@ -573,15 +575,15 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp * * \sa \ref TutorialSparseSolverConcept */ -template -class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> > +template +class CholmodSupernodalLLT : public CholmodBase > { - typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; + typedef CholmodBase Base; using Base::m_cholmod; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; CholmodSupernodalLLT() : Base() { init(); } @@ -612,8 +614,8 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper * On the other hand, it does not provide access to the result of the factorization. * The default is to let Cholmod automatically choose between a simplicial and supernodal factorization. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. * * \implsparsesolverconcept @@ -624,15 +626,15 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper * * \sa \ref TutorialSparseSolverConcept */ -template -class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> > +template +class CholmodDecomposition : public CholmodBase > { - typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; + typedef CholmodBase Base; using Base::m_cholmod; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; CholmodDecomposition() : Base() { init(); } diff --git a/libs/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..0fb3abc --- /dev/null +++ b/libs/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CHOLMODSUPPORT_MODULE_H +#error "Please include Eigen/CholmodSupport instead of including headers inside the src directory directly." 
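As a usage sketch for the Cholmod solver classes above, assuming CHOLMOD is installed and linked; the setMode() call and its enum value come from the CholmodMode machinery earlier in this file:

    #include <Eigen/CholmodSupport>
    #include <Eigen/SparseCore>

    using SpMat = Eigen::SparseMatrix<double>;

    Eigen::VectorXd solveWithCholmod(const SpMat& A, const Eigen::VectorXd& b) {
      Eigen::CholmodDecomposition<SpMat, Eigen::Lower> solver;
      solver.setMode(Eigen::CholmodSupernodalLLt);  // optional: the default lets Cholmod choose
      solver.compute(A);
      return solver.solve(b);
    }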
+#endif diff --git a/libs/eigen/Eigen/src/Core/ArithmeticSequence.h b/libs/eigen/Eigen/src/Core/ArithmeticSequence.h index b6200fa..81005c5 100644 --- a/libs/eigen/Eigen/src/Core/ArithmeticSequence.h +++ b/libs/eigen/Eigen/src/Core/ArithmeticSequence.h @@ -10,69 +10,18 @@ #ifndef EIGEN_ARITHMETIC_SEQUENCE_H #define EIGEN_ARITHMETIC_SEQUENCE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) -template struct aseq_negate {}; - -template<> struct aseq_negate { - typedef Index type; -}; - -template struct aseq_negate > { - typedef FixedInt<-N> type; -}; - -// Compilation error in the following case: -template<> struct aseq_negate > {}; - -template::value, - bool SizeIsSymbolic =symbolic::is_symbolic::value> -struct aseq_reverse_first_type { - typedef Index type; -}; - -template -struct aseq_reverse_first_type { - typedef symbolic::AddExpr > >, - symbolic::ValueExpr > - > type; -}; - -template -struct aseq_reverse_first_type_aux { - typedef Index type; -}; - -template -struct aseq_reverse_first_type_aux::type> { - typedef FixedInt<(SizeType::value-1)*IncrType::value> type; -}; - -template -struct aseq_reverse_first_type { - typedef typename aseq_reverse_first_type_aux::type Aux; - typedef symbolic::AddExpr > type; -}; - -template -struct aseq_reverse_first_type { - typedef symbolic::AddExpr > >, - symbolic::ValueExpr >, - symbolic::ValueExpr<> > type; -}; -#endif - // Helper to cleanup the type of the increment: template struct cleanup_seq_incr { typedef typename cleanup_index_type::type type; }; -} +} // namespace internal //-------------------------------------------------------------------------------- // seq(first,last,incr) and seqN(first,size,incr) @@ -137,21 +86,9 @@ protected: IncrType m_incr; public: - -#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); } -#else -protected: - typedef typename internal::aseq_negate::type ReverseIncrType; - typedef typename internal::aseq_reverse_first_type::type ReverseFirstType; -public: - ArithmeticSequence - reverse() const { - return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); - } -#endif }; /** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr @@ -200,7 +137,6 @@ auto seq(FirstType f, LastType l); #else // EIGEN_PARSED_BY_DOXYGEN -#if EIGEN_HAS_CXX11 template auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), ( typename internal::cleanup_index_type::type(l) @@ -226,101 +162,11 @@ auto seq(FirstType f, LastType l, IncrType incr) CleanedIncrType(incr)); } -#else // EIGEN_HAS_CXX11 - -template -typename internal::enable_if::value || symbolic::is_symbolic::value), - ArithmeticSequence::type,Index> >::type -seq(FirstType f, LastType l) -{ - return seqN(typename internal::cleanup_index_type::type(f), - Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+fix<1>()))); -} - -template -typename internal::enable_if::value, - ArithmeticSequence,symbolic::ValueExpr<> >, - symbolic::ValueExpr > > > >::type -seq(const symbolic::BaseExpr &f, LastType l) -{ - return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); -} - -template -typename internal::enable_if::value, - ArithmeticSequence::type, - 
symbolic::AddExpr >, - symbolic::ValueExpr > > > >::type -seq(FirstType f, const symbolic::BaseExpr &l) -{ - return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); -} - -template -ArithmeticSequence >,symbolic::ValueExpr > > > -seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) -{ - return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); -} - - -template -typename internal::enable_if::value || symbolic::is_symbolic::value), - ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type -seq(FirstType f, LastType l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(typename internal::cleanup_index_type::type(f), - Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr); -} - -template -typename internal::enable_if::value, - ArithmeticSequence, - symbolic::ValueExpr<> >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> >::type -seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} - -template -typename internal::enable_if::value, - ArithmeticSequence::type, - symbolic::QuotientExpr >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> >::type -seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(typename internal::cleanup_index_type::type(f), - (l.derived()-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} - -template -ArithmeticSequence >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> -seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} -#endif // EIGEN_HAS_CXX11 #endif // EIGEN_PARSED_BY_DOXYGEN +namespace placeholders { -#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN) /** \cpp11 * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. 
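The net effect of this hunk is that last, lastN and lastp1 now live in Eigen::placeholders (with Eigen::indexing re-exporting them, see below). A quick sketch of the resulting call sites, with the qualification per the new namespace and illustrative values:

    #include <Eigen/Dense>

    int main() {
      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10, 0.0, 9.0);
      auto tail3 = v(Eigen::placeholders::lastN(3));               // elements 7, 8, 9
      auto evens = v(Eigen::seq(0, Eigen::placeholders::last, 2)); // 0, 2, 4, 6, 8
      auto block = v(Eigen::seqN(2, 4));                           // 2, 3, 4, 5
      return static_cast<int>(tail3.size() + evens.size() + block.size());
    }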
* @@ -329,9 +175,9 @@ seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr auto lastN(SizeType size, IncrType incr) --> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +-> decltype(seqN(Eigen::placeholders::last-(size-fix<1>())*incr, size, incr)) { - return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); + return seqN(Eigen::placeholders::last-(size-fix<1>())*incr, size, incr); } /** \cpp11 @@ -342,18 +188,19 @@ auto lastN(SizeType size, IncrType incr) * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ template auto lastN(SizeType size) --> decltype(seqN(Eigen::last+fix<1>()-size, size)) +-> decltype(seqN(Eigen::placeholders::last+fix<1>()-size, size)) { - return seqN(Eigen::last+fix<1>()-size, size); + return seqN(Eigen::placeholders::last+fix<1>()-size, size); } -#endif + +} // namespace placeholders namespace internal { // Convert a symbolic span into a usable one (i.e., remove last/end "keywords") template struct make_size_type { - typedef typename internal::conditional::value, Index, T>::type type; + typedef std::conditional_t::value, Index, T> type; }; template @@ -387,25 +234,23 @@ struct get_compile_time_incr > { * \code using namespace Eigen::indexing; \endcode * is equivalent to: * \code - using Eigen::all; + using Eigen::fix; using Eigen::seq; using Eigen::seqN; - using Eigen::lastN; // c++11 only - using Eigen::last; - using Eigen::lastp1; - using Eigen::fix; + using Eigen::placeholders::all; + using Eigen::placeholders::last; + using Eigen::placeholders::lastN; // c++11 only + using Eigen::placeholders::lastp1; \endcode */ namespace indexing { - using Eigen::all; + using Eigen::fix; using Eigen::seq; using Eigen::seqN; - #if EIGEN_HAS_CXX11 - using Eigen::lastN; - #endif - using Eigen::last; - using Eigen::lastp1; - using Eigen::fix; + using Eigen::placeholders::all; + using Eigen::placeholders::last; + using Eigen::placeholders::lastN; + using Eigen::placeholders::lastp1; } } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/Array.h b/libs/eigen/Eigen/src/Core/Array.h index 20c789b..d7a5e7a 100644 --- a/libs/eigen/Eigen/src/Core/Array.h +++ b/libs/eigen/Eigen/src/Core/Array.h @@ -10,14 +10,16 @@ #ifndef EIGEN_ARRAY_H #define EIGEN_ARRAY_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -struct traits > : traits > +template +struct traits > : traits > { typedef ArrayXpr XprKind; - typedef ArrayBase > XprBase; + typedef ArrayBase > XprBase; }; } @@ -41,16 +43,16 @@ struct traits > : tra * * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy */ -template +template class Array - : public PlainObjectBase > + : public PlainObjectBase > { public: typedef PlainObjectBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(Array) - enum { Options = _Options }; + enum { Options = Options_ }; typedef typename Base::PlainObject PlainObject; protected: @@ -131,7 +133,6 @@ class Array EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array() : Base() { - Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } @@ -142,17 +143,14 @@ class Array Array(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { - Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #endif -#if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { - Base::_check_template_params(); } EIGEN_DEVICE_FUNC Array& 
operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) @@ -160,9 +158,7 @@ class Array Base::operator=(std::move(other)); return *this; } -#endif - #if EIGEN_HAS_CXX11 /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) * * Example: \include Array_variadic_ctor_cxx11.cpp @@ -197,16 +193,15 @@ class Array * * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const std::initializer_list>& list) : Base(list) {} - #endif // end EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array( + const std::initializer_list>& list) + : Base(list) {} #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(const T& x) { - Base::_check_template_params(); Base::template _init1(x); } @@ -214,7 +209,6 @@ class Array EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1) { - Base::_check_template_params(); this->template _init2(val0, val1); } @@ -249,7 +243,6 @@ class Array EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { - Base::_check_template_params(); EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3) m_storage.data()[0] = val0; m_storage.data()[1] = val1; @@ -261,7 +254,6 @@ class Array EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { - Base::_check_template_params(); EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4) m_storage.data()[0] = val0; m_storage.data()[1] = val1; @@ -283,8 +275,8 @@ class Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const EigenBase &other, - typename internal::enable_if::value, - PrivateType>::type = PrivateType()) + std::enable_if_t::value, + PrivateType> = PrivateType()) : Base(other.derived()) { } @@ -359,8 +351,6 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS #undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#if EIGEN_HAS_CXX11 - #define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ /** \ingroup arraytypedefs */ \ /** \brief \cpp11 */ \ @@ -392,8 +382,6 @@ EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) #undef EIGEN_MAKE_ARRAY_TYPEDEFS #undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#endif // EIGEN_HAS_CXX11 - #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ using Eigen::Vector##SizeSuffix##TypeSuffix; \ diff --git a/libs/eigen/Eigen/src/Core/ArrayBase.h b/libs/eigen/Eigen/src/Core/ArrayBase.h index ea3dd1c..28397e5 100644 --- a/libs/eigen/Eigen/src/Core/ArrayBase.h +++ b/libs/eigen/Eigen/src/Core/ArrayBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ARRAYBASE_H #define EIGEN_ARRAYBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class MatrixWrapper; @@ -21,7 +23,7 @@ template class MatrixWrapper; * * An array is similar to a dense vector or matrix. While matrices are mathematical * objects with well defined linear algebra operators, an array is just a collection - * of scalar values arranged in a one or two dimensionnal fashion. As the main consequence, + * of scalar values arranged in a one or two dimensional fashion. As the main consequence, * all operations applied to an array are performed coefficient wise. 
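A small sketch of the coefficient-wise semantics described above; values are illustrative:

    #include <Eigen/Dense>

    int main() {
      Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.0, 3.0);
      Eigen::ArrayXd b = a * a + 1.0;        // product and shift applied per coefficient
      Eigen::ArrayXd c = a.sin();            // std-style math function, coefficient-wise
      Eigen::VectorXd v = (b + c).matrix();  // convert back for linear-algebra operations
      return v.size() == 4 ? 0 : 1;
    }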
Furthermore, * arrays support scalar math functions of the c++ standard library (e.g., std::sin(x)), and convenient * constructors allowing to easily write generic code working for both scalar values diff --git a/libs/eigen/Eigen/src/Core/ArrayWrapper.h b/libs/eigen/Eigen/src/Core/ArrayWrapper.h index 2e9555b..e65b8fb 100644 --- a/libs/eigen/Eigen/src/Core/ArrayWrapper.h +++ b/libs/eigen/Eigen/src/Core/ArrayWrapper.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ARRAYWRAPPER_H #define EIGEN_ARRAYWRAPPER_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class ArrayWrapper @@ -26,12 +28,12 @@ namespace Eigen { namespace internal { template struct traits > - : public traits::type > + : public traits > { typedef ArrayXpr XprKind; // Let's remove NestByRefBit enum { - Flags0 = traits::type >::Flags, + Flags0 = traits >::Flags, LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; @@ -45,13 +47,13 @@ class ArrayWrapper : public ArrayBase > typedef ArrayBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; - typedef typename internal::conditional< + typedef std::conditional_t< internal::is_lvalue::value, Scalar, const Scalar - >::type ScalarWithConstIfNotLvalue; + > ScalarWithConstIfNotLvalue; typedef typename internal::ref_selector::non_const_type NestedExpressionType; @@ -91,7 +93,7 @@ class ArrayWrapper : public ArrayBase > inline void evalTo(Dest& dst) const { dst = m_expression; } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_expression; @@ -124,12 +126,12 @@ class ArrayWrapper : public ArrayBase > namespace internal { template struct traits > - : public traits::type > + : public traits > { typedef MatrixXpr XprKind; // Let's remove NestByRefBit enum { - Flags0 = traits::type >::Flags, + Flags0 = traits >::Flags, LvalueBitFlag = is_lvalue::value ? 
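
A pattern repeated across these files: internal::conditional<...>::type and internal::remove_all<...>::type become the alias forms std::conditional_t and internal::remove_all_t. The mechanics, shown with a stand-in trait (my_conditional is illustrative, not an Eigen name):

    #include <type_traits>

    // Pre-C++14 spelling: a nested ::type, needing 'typename' in dependent contexts.
    template <bool C, class A, class B> struct my_conditional { typedef A type; };
    template <class A, class B> struct my_conditional<false, A, B> { typedef B type; };

    // C++14 alias template: no 'typename', no '::type' at the point of use.
    template <bool C, class A, class B>
    using my_conditional_t = typename my_conditional<C, A, B>::type;

    static_assert(std::is_same<my_conditional_t<true, int, float>, int>::value,
                  "same selection, shorter spelling");
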
LvalueBit : 0, Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; @@ -143,13 +145,13 @@ class MatrixWrapper : public MatrixBase > typedef MatrixBase > Base; EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; - typedef typename internal::conditional< - internal::is_lvalue::value, - Scalar, - const Scalar - >::type ScalarWithConstIfNotLvalue; + typedef std::conditional_t< + internal::is_lvalue::value, + Scalar, + const Scalar + > ScalarWithConstIfNotLvalue; typedef typename internal::ref_selector::non_const_type NestedExpressionType; @@ -185,7 +187,7 @@ class MatrixWrapper : public MatrixBase > } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_expression; diff --git a/libs/eigen/Eigen/src/Core/Assign.h b/libs/eigen/Eigen/src/Core/Assign.h index 655412e..dc716d3 100644 --- a/libs/eigen/Eigen/src/Core/Assign.h +++ b/libs/eigen/Eigen/src/Core/Assign.h @@ -12,6 +12,8 @@ #ifndef EIGEN_ASSIGN_H #define EIGEN_ASSIGN_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template diff --git a/libs/eigen/Eigen/src/Core/AssignEvaluator.h b/libs/eigen/Eigen/src/Core/AssignEvaluator.h index 7d76f0c..8fb1f81 100644 --- a/libs/eigen/Eigen/src/Core/AssignEvaluator.h +++ b/libs/eigen/Eigen/src/Core/AssignEvaluator.h @@ -12,6 +12,8 @@ #ifndef EIGEN_ASSIGN_EVALUATOR_H #define EIGEN_ASSIGN_EVALUATOR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // This implementation is based on Assign.h @@ -40,7 +42,7 @@ public: DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, - JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) + JointAlignment = plain_enum_min(DstAlignment, SrcAlignment) }; private: @@ -51,8 +53,8 @@ private: InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) : int(DstFlags)&RowMajorBit ? 
int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), - RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize), - RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize), + RestrictedInnerSize = min_size_prefer_fixed(InnerSize, MaxPacketSize), + RestrictedLinearSize = min_size_prefer_fixed(Dst::SizeAtCompileTime, MaxPacketSize), OuterStride = int(outer_stride_at_compile_time::ret), MaxSizeAtCompileTime = Dst::SizeAtCompileTime }; @@ -111,7 +113,7 @@ public: || int(Traversal) == SliceVectorizedTraversal }; - typedef typename conditional::type PacketType; + typedef std::conditional_t PacketType; private: enum { @@ -216,7 +218,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling template struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) { } }; template @@ -285,7 +287,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling template struct copy_using_evaluator_innervec_CompleteUnrolling { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) { } }; template @@ -325,10 +327,9 @@ struct dense_assignment_loop; template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/) + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE EIGEN_CONSTEXPR run(Kernel& /*kernel*/) { - typedef typename Kernel::DstEvaluatorType::XprType DstXprType; - EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0, + EIGEN_STATIC_ASSERT(int(Kernel::DstEvaluatorType::XprType::SizeAtCompileTime) == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT) } }; @@ -386,7 +387,7 @@ struct unaligned_dense_assignment_loop { // if IsAligned = true, then do nothing template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&, Index, Index) {} }; template <> @@ -402,7 +403,7 @@ struct unaligned_dense_assignment_loop Index end) #else template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel, Index start, Index end) #endif @@ -415,7 +416,7 @@ struct unaligned_dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { const Index size = kernel.size(); typedef typename Kernel::Scalar Scalar; @@ -443,7 +444,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; @@ -469,7 +470,7 @@ struct dense_assignment_loop SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, DstAlignment = Kernel::AssignmentTraits::DstAlignment }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); @@ -511,7 +512,7 @@ struct 
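
EIGEN_PLAIN_ENUM_MIN and EIGEN_SIZE_MIN_PREFER_FIXED give way to the constexpr helpers plain_enum_min and min_size_prefer_fixed. The bodies below reconstruct the intended semantics for illustration and are not Eigen's literal code (Dynamic_ stands in for Eigen::Dynamic):

    // min over enum/int values, without macro double-evaluation pitfalls.
    template <typename A, typename B>
    constexpr int plain_enum_min(A a, B b) {
      return int(a) <= int(b) ? int(a) : int(b);
    }

    constexpr int Dynamic_ = -1;  // Eigen::Dynamic in the real library

    // "Prefer fixed": a known compile-time extent beats Dynamic; otherwise
    // take the ordinary minimum.
    constexpr int min_size_prefer_fixed(int a, int b) {
      return a == Dynamic_ ? b : (b == Dynamic_ ? a : (a < b ? a : b));
    }

    static_assert(min_size_prefer_fixed(Dynamic_, 4) == 4, "fixed size wins");
    static_assert(min_size_prefer_fixed(2, 4) == 2, "plain minimum otherwise");
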
dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { const Index size = kernel.size(); for(Index i = 0; i < size; ++i) @@ -522,7 +523,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); @@ -536,7 +537,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; @@ -584,7 +585,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; @@ -766,7 +767,7 @@ void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::a } template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) { typedef evaluator DstEvaluatorType; typedef evaluator SrcEvaluatorType; @@ -844,8 +845,8 @@ void call_assignment(const Dst& dst, const Src& src) // Deal with "assume-aliasing" template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR +void call_assignment(Dst& dst, const Src& src, const Func& func, std::enable_if_t< evaluator_assume_aliasing::value, void*> = 0) { typename plain_matrix_type::type tmp(src); call_assignment_no_alias(dst, tmp, func); @@ -853,7 +854,7 @@ void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) +void call_assignment(Dst& dst, const Src& src, const Func& func, std::enable_if_t::value, void*> = 0) { call_assignment_no_alias(dst, src, func); } @@ -861,7 +862,7 @@ void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable // by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template class StorageBase, typename Src, typename Func> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment(NoAlias& dst, const Src& src, const Func& func) { call_assignment_no_alias(dst.expression(), src, func); @@ -869,7 +870,7 @@ void call_assignment(NoAlias& dst, const Src& src, const Func& template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) 
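
The call_assignment overload set above now dispatches on std::enable_if_t in a defaulted void* parameter rather than the internal enable_if. Reduced to its skeleton (assumes_aliasing and assign are hypothetical stand-ins for evaluator_assume_aliasing and call_assignment):

    #include <type_traits>
    #include <iostream>

    template <class T> struct assumes_aliasing : std::false_type {};
    struct Prod {};  // stand-in for a product-like expression
    template <> struct assumes_aliasing<Prod> : std::true_type {};

    // Chosen when T may alias its destination: evaluate into a temporary first.
    template <class T>
    void assign(const T&, std::enable_if_t<assumes_aliasing<T>::value, void*> = nullptr) {
      std::cout << "via temporary\n";
    }
    // Chosen otherwise: assign directly.
    template <class T>
    void assign(const T&, std::enable_if_t<!assumes_aliasing<T>::value, void*> = nullptr) {
      std::cout << "direct\n";
    }

    int main() { assign(Prod{}); assign(42); }
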
{ enum { @@ -878,8 +879,8 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) ) && int(Dst::SizeAtCompileTime) != 1 }; - typedef typename internal::conditional, Dst>::type ActualDstTypeCleaned; - typedef typename internal::conditional, Dst&>::type ActualDstType; + typedef std::conditional_t, Dst> ActualDstTypeCleaned; + typedef std::conditional_t, Dst&> ActualDstType; ActualDstType actualDst(dst); // TODO check whether this is the right place to perform these checks: @@ -911,14 +912,14 @@ void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const } template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias(Dst& dst, const Src& src) { call_assignment_no_alias(dst, src, internal::assign_op()); } template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) { // TODO check whether this is the right place to perform these checks: @@ -929,7 +930,7 @@ void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& Assignment::run(dst, src, func); } template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { call_assignment_no_alias_no_transpose(dst, src, internal::assign_op()); diff --git a/libs/eigen/Eigen/src/Core/Assign_MKL.h b/libs/eigen/Eigen/src/Core/Assign_MKL.h old mode 100755 new mode 100644 index c6140d1..f9b86c8 --- a/libs/eigen/Eigen/src/Core/Assign_MKL.h +++ b/libs/eigen/Eigen/src/Core/Assign_MKL.h @@ -34,6 +34,8 @@ #ifndef EIGEN_ASSIGN_VML_H #define EIGEN_ASSIGN_VML_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -82,7 +84,7 @@ class vml_assign_traits #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ template< typename DstXprType, typename SrcXprNested> \ struct Assignment, SrcXprNested>, assign_op, \ - Dense2Dense, typename enable_if::EnableVml>::type> { \ + Dense2Dense, std::enable_if_t::EnableVml>> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ resize_if_allowed(dst, src, func); \ @@ -142,7 +144,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) template< typename DstXprType, typename SrcXprNested, typename Plain> \ struct Assignment, SrcXprNested, \ const CwiseNullaryOp,Plain> >, assign_op, \ - Dense2Dense, typename enable_if::EnableVml>::type> { \ + Dense2Dense, std::enable_if_t::EnableVml>> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ diff --git a/libs/eigen/Eigen/src/Core/BandMatrix.h b/libs/eigen/Eigen/src/Core/BandMatrix.h index 878c024..dcb0d13 100644 --- a/libs/eigen/Eigen/src/Core/BandMatrix.h +++ b/libs/eigen/Eigen/src/Core/BandMatrix.h @@ -10,6 +10,8 @@ #ifndef EIGEN_BANDMATRIX_H #define EIGEN_BANDMATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -41,7 +43,7 @@ class BandMatrixBase : public EigenBase DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 
1 + Supers + Subs : Dynamic, - SizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime) + SizeAtCompileTime = min_size_prefer_dynamic(RowsAtCompileTime,ColsAtCompileTime) }; public: @@ -96,13 +98,13 @@ class BandMatrixBase : public EigenBase DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) ? Dynamic : (ActualIndex<0 - ? EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime, RowsAtCompileTime + ActualIndex) - : EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime - ActualIndex)) + ? min_size_prefer_dynamic(ColsAtCompileTime, RowsAtCompileTime + ActualIndex) + : min_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime - ActualIndex)) }; typedef Block BuildType; - typedef typename internal::conditional,BuildType >, - BuildType>::type Type; + BuildType> Type; }; /** \returns a vector expression of the \a N -th sub or super diagonal */ @@ -161,12 +163,12 @@ class BandMatrixBase : public EigenBase * * \brief Represents a rectangular matrix with a banded storage * - * \tparam _Scalar Numeric type, i.e. float, double, int - * \tparam _Rows Number of rows, or \b Dynamic - * \tparam _Cols Number of columns, or \b Dynamic - * \tparam _Supers Number of super diagonal - * \tparam _Subs Number of sub diagonal - * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint + * \tparam Scalar_ Numeric type, i.e. float, double, int + * \tparam Rows_ Number of rows, or \b Dynamic + * \tparam Cols_ Number of columns, or \b Dynamic + * \tparam Supers_ Number of super diagonal + * \tparam Subs_ Number of sub diagonal + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint * The former controls \ref TopicStorageOrders "storage order", and defaults to * column-major. The latter controls whether the matrix represents a selfadjoint * matrix in which case either Supers of Subs have to be null. @@ -174,29 +176,29 @@ class BandMatrixBase : public EigenBase * \sa class TridiagonalMatrix */ -template -struct traits > +template +struct traits > { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Dense StorageKind; typedef Eigen::Index StorageIndex; enum { CoeffReadCost = NumTraits::ReadCost, - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _Rows, - MaxColsAtCompileTime = _Cols, + RowsAtCompileTime = Rows_, + ColsAtCompileTime = Cols_, + MaxRowsAtCompileTime = Rows_, + MaxColsAtCompileTime = Cols_, Flags = LvalueBit, - Supers = _Supers, - Subs = _Subs, - Options = _Options, + Supers = Supers_, + Subs = Subs_, + Options = Options_, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 
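
The wholesale rename from _Scalar/_Rows/... to Scalar_/Rows_/... in BandMatrix (and the other classes in this patch) is about reserved identifiers: in C++ a leading underscore followed by an uppercase letter is reserved for the implementation, so such names may collide with standard-library internals. The adopted convention:

    // template <typename _Scalar> struct traits;      // reserved name, avoid
    template <typename Scalar_, int Rows_, int Cols_>  // trailing underscore: fine
    struct my_traits {
      typedef Scalar_ Scalar;
      static constexpr int RowsAtCompileTime = Rows_;
      static constexpr int ColsAtCompileTime = Cols_;
    };
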
1 + Supers + Subs : Dynamic }; typedef Matrix CoefficientsType; }; -template -class BandMatrix : public BandMatrixBase > +template +class BandMatrix : public BandMatrixBase > { public: @@ -233,32 +235,32 @@ class BandMatrix : public BandMatrixBase m_subs; }; -template +template class BandMatrixWrapper; -template -struct traits > +template +struct traits > { - typedef typename _CoefficientsType::Scalar Scalar; - typedef typename _CoefficientsType::StorageKind StorageKind; - typedef typename _CoefficientsType::StorageIndex StorageIndex; + typedef typename CoefficientsType_::Scalar Scalar; + typedef typename CoefficientsType_::StorageKind StorageKind; + typedef typename CoefficientsType_::StorageIndex StorageIndex; enum { - CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost, - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _Rows, - MaxColsAtCompileTime = _Cols, + CoeffReadCost = internal::traits::CoeffReadCost, + RowsAtCompileTime = Rows_, + ColsAtCompileTime = Cols_, + MaxRowsAtCompileTime = Rows_, + MaxColsAtCompileTime = Cols_, Flags = LvalueBit, - Supers = _Supers, - Subs = _Subs, - Options = _Options, + Supers = Supers_, + Subs = Subs_, + Options = Options_, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef _CoefficientsType CoefficientsType; + typedef CoefficientsType_ CoefficientsType; }; -template -class BandMatrixWrapper : public BandMatrixBase > +template +class BandMatrixWrapper : public BandMatrixBase > { public: @@ -266,12 +268,12 @@ class BandMatrixWrapper : public BandMatrixBase::CoefficientsType CoefficientsType; typedef typename internal::traits::StorageIndex StorageIndex; - explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs) + explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=Rows_, Index cols=Cols_, Index supers=Supers_, Index subs=Subs_) : m_coeffs(coeffs), m_rows(rows), m_supers(supers), m_subs(subs) { EIGEN_UNUSED_VARIABLE(cols); - //internal::assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows()); + // eigen_assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows()); } /** \returns the number of columns */ @@ -291,9 +293,9 @@ class BandMatrixWrapper : public BandMatrixBase m_rows; - internal::variable_if_dynamic m_supers; - internal::variable_if_dynamic m_subs; + internal::variable_if_dynamic m_rows; + internal::variable_if_dynamic m_supers; + internal::variable_if_dynamic m_subs; }; /** @@ -330,16 +332,16 @@ class TridiagonalMatrix : public BandMatrix -struct evaluator_traits > - : public evaluator_traits_base > +template +struct evaluator_traits > + : public evaluator_traits_base > { typedef BandShape Shape; }; -template -struct evaluator_traits > - : public evaluator_traits_base > +template +struct evaluator_traits > + : public evaluator_traits_base > { typedef BandShape Shape; }; diff --git a/libs/eigen/Eigen/src/Core/Block.h b/libs/eigen/Eigen/src/Core/Block.h index 3206d66..19c4b68 100644 --- a/libs/eigen/Eigen/src/Core/Block.h +++ b/libs/eigen/Eigen/src/Core/Block.h @@ -11,6 +11,8 @@ #ifndef EIGEN_BLOCK_H #define EIGEN_BLOCK_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -21,7 +23,7 @@ struct traits > : traits::StorageKind StorageKind; typedef typename traits::XprKind XprKind; typedef typename ref_selector::type XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; + 
typedef std::remove_reference_t XprTypeNested_; enum{ MatrixRows = traits::RowsAtCompileTime, MatrixCols = traits::ColsAtCompileTime, @@ -110,7 +112,7 @@ template class EIGEN_GENERIC_PUBLIC_INTERFACE(Block) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; /** Column or Row constructor */ @@ -260,19 +262,19 @@ template - inline PacketScalar packet(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const { return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template - inline PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const { return m_xpr.template packet (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -280,7 +282,7 @@ template - inline void writePacket(Index index, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) { m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -295,7 +297,7 @@ template::type& nestedExpression() const + const internal::remove_all_t& nestedExpression() const { return m_xpr; } @@ -378,7 +380,7 @@ class BlockImpl_dense } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& nestedExpression() const EIGEN_NOEXCEPT + const internal::remove_all_t& nestedExpression() const EIGEN_NOEXCEPT { return m_xpr; } diff --git a/libs/eigen/Eigen/src/Core/BooleanRedux.h b/libs/eigen/Eigen/src/Core/BooleanRedux.h index 852de8b..20e5bd9 100644 --- a/libs/eigen/Eigen/src/Core/BooleanRedux.h +++ b/libs/eigen/Eigen/src/Core/BooleanRedux.h @@ -10,58 +10,62 @@ #ifndef EIGEN_ALLANDANY_H #define EIGEN_ALLANDANY_H -namespace Eigen { +#include "./InternalHeaderCheck.h" + +namespace Eigen { namespace internal { -template +template struct all_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; - + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? 
j : i); } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; @@ -81,16 +85,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (!evaluator.coeff(i, j)) return false; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false; return true; } } @@ -105,16 +109,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (evaluator.coeff(i, j)) return true; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true; return false; } } @@ -134,7 +138,7 @@ EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const * \sa allFinite() */ template -inline bool DenseBase::hasNaN() const +EIGEN_DEVICE_FUNC inline bool DenseBase::hasNaN() const { #if EIGEN_COMP_MSVC || (defined __FAST_MATH__) return derived().array().isNaN().any(); @@ -148,7 +152,7 @@ inline bool DenseBase::hasNaN() const * \sa hasNaN() */ template -inline bool DenseBase::allFinite() const +EIGEN_DEVICE_FUNC inline bool DenseBase::allFinite() const { #if EIGEN_COMP_MSVC || (defined __FAST_MATH__) return derived().array().isFinite().all(); @@ -156,7 +160,7 @@ inline bool DenseBase::allFinite() const return !((derived()-derived()).hasNaN()); #endif } - + } // end namespace Eigen #endif // EIGEN_ALLANDANY_H diff --git a/libs/eigen/Eigen/src/Core/CommaInitializer.h b/libs/eigen/Eigen/src/Core/CommaInitializer.h index c0e29c7..7c2eea8 100644 --- a/libs/eigen/Eigen/src/Core/CommaInitializer.h +++ b/libs/eigen/Eigen/src/Core/CommaInitializer.h @@ -11,6 +11,8 @@ #ifndef EIGEN_COMMAINITIALIZER_H #define EIGEN_COMMAINITIALIZER_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class CommaInitializer @@ -45,7 +47,7 @@ struct CommaInitializer { eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() && "Cannot comma-initialize a 0x0 matrix (operator<<)"); - m_xpr.block(0, 0, other.rows(), other.cols()) = other; + m_xpr.template block(0, 0, other.rows(), other.cols()) = other; } /* Copy/Move constructor which transfers ownership. 
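
The rewritten all()/any() unrollers and runtime loops index coefficients as (outer, inner) and swap the pair for row-major expressions, so traversal now follows the actual storage order instead of always walking column by column. The public API is untouched:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix<float, 2, 3, Eigen::RowMajor> m;
      m << 1, 2, 3,
           4, 5, 6;
      // Internally traversed row by row now, matching the layout.
      std::cout << std::boolalpha
                << (m.array() > 0).all() << " "    // true
                << (m.array() > 5).any() << "\n";  // true
    }
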
This is crucial in diff --git a/libs/eigen/Eigen/src/Core/ConditionEstimator.h b/libs/eigen/Eigen/src/Core/ConditionEstimator.h index 51a2e5f..694be8b 100644 --- a/libs/eigen/Eigen/src/Core/ConditionEstimator.h +++ b/libs/eigen/Eigen/src/Core/ConditionEstimator.h @@ -10,6 +10,8 @@ #ifndef EIGEN_CONDITIONESTIMATOR_H #define EIGEN_CONDITIONESTIMATOR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -160,12 +162,12 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return NumTraits::infinity(); - if (matrix_norm == RealScalar(0)) return RealScalar(0); - if (dec.rows() == 1) return RealScalar(1); + if (dec.rows() == 0) return NumTraits::infinity(); + if (numext::is_exactly_zero(matrix_norm)) return RealScalar(0); + if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); - return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0) - : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); + return (numext::is_exactly_zero(inverse_matrix_norm) ? RealScalar(0) + : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); } } // namespace internal diff --git a/libs/eigen/Eigen/src/Core/CoreEvaluators.h b/libs/eigen/Eigen/src/Core/CoreEvaluators.h index 0ff8c8d..1729507 100644 --- a/libs/eigen/Eigen/src/Core/CoreEvaluators.h +++ b/libs/eigen/Eigen/src/Core/CoreEvaluators.h @@ -13,6 +13,8 @@ #ifndef EIGEN_COREEVALUATORS_H #define EIGEN_COREEVALUATORS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -498,7 +500,7 @@ struct evaluator > : evaluator_base > { typedef CwiseNullaryOp XprType; - typedef typename internal::remove_all::type PlainObjectTypeCleaned; + typedef internal::remove_all_t PlainObjectTypeCleaned; enum { CoeffReadCost = internal::functor_traits::Cost, @@ -655,9 +657,9 @@ struct ternary_evaluator, IndexBased ) ), Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit), - Alignment = EIGEN_PLAIN_ENUM_MIN( - EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment), - evaluator::Alignment) + Alignment = plain_enum_min( + plain_enum_min(evaluator::Alignment, evaluator::Alignment), + evaluator::Alignment) }; EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) @@ -751,7 +753,7 @@ struct binary_evaluator, IndexBased, IndexBase ) ), Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit), - Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) + Alignment = plain_enum_min(evaluator::Alignment, evaluator::Alignment) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -810,11 +812,11 @@ protected: // -------------------- CwiseUnaryView -------------------- -template -struct unary_evaluator, IndexBased> - : evaluator_base > +template +struct unary_evaluator, IndexBased> + : evaluator_base > { - typedef CwiseUnaryView XprType; + typedef CwiseUnaryView XprType; enum { CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), @@ -900,7 +902,8 @@ struct mapbase_evaluator : evaluator_base m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) { - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator::Flags&PacketAccessBit, internal::inner_stride_at_compile_time::ret==1), + EIGEN_STATIC_ASSERT(check_implication((evaluator::Flags & PacketAccessBit) != 0, + internal::inner_stride_at_compile_time::ret == 1), 
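
rcond_estimate_helper now spells its exact floating-point comparisons as numext::is_exactly_zero(x) instead of x == RealScalar(0). The helper amounts to something like the sketch below (my reconstruction, not Eigen's literal code; it centralizes a deliberate exact compare so -Wfloat-equal noise stays in one audited place):

    #include <iostream>

    template <typename T>
    inline bool is_exactly_zero(const T& x) { return x == T(0); }

    int main() {
      std::cout << std::boolalpha
                << is_exactly_zero(0.0) << " "      // true
                << is_exactly_zero(1e-300) << "\n"; // false: no tolerance involved
    }
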
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1072,7 +1075,7 @@ struct evaluator > Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (OuterStrideAtCompileTime!=0) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, - Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) + Alignment = plain_enum_min(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1222,8 +1225,8 @@ struct block_evaluator(block) { - // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime - eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + eigen_internal_assert((internal::is_constant_evaluated() || (internal::UIntPtr(block.data()) % plain_enum_max(1,evaluator::Alignment)) == 0) \ + && "data is not aligned"); } }; @@ -1239,12 +1242,12 @@ struct evaluator > typedef Select XprType; enum { CoeffReadCost = evaluator::CoeffReadCost - + EIGEN_PLAIN_ENUM_MAX(evaluator::CoeffReadCost, - evaluator::CoeffReadCost), + + plain_enum_max(evaluator::CoeffReadCost, + evaluator::CoeffReadCost), Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, - Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) + Alignment = plain_enum_min(evaluator::Alignment, evaluator::Alignment) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1295,7 +1298,7 @@ struct unary_evaluator > Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor }; typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef internal::remove_all_t ArgTypeNestedCleaned; enum { CoeffReadCost = evaluator::CoeffReadCost, @@ -1379,7 +1382,7 @@ template struct evaluator_wrapper_base : evaluator_base { - typedef typename remove_all::type ArgType; + typedef remove_all_t ArgType; enum { CoeffReadCost = evaluator::CoeffReadCost, Flags = evaluator::Flags, @@ -1720,14 +1723,14 @@ struct evaluator > EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); } // This constructor is used when nesting an EvalTo evaluator in another evaluator EIGEN_DEVICE_FUNC evaluator(const ArgType& arg) : m_result(arg) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); } protected: diff --git a/libs/eigen/Eigen/src/Core/CoreIterators.h b/libs/eigen/Eigen/src/Core/CoreIterators.h index b967196..f74568a 100644 --- a/libs/eigen/Eigen/src/Core/CoreIterators.h +++ b/libs/eigen/Eigen/src/Core/CoreIterators.h @@ -10,6 +10,8 @@ #ifndef EIGEN_COREITERATORS_H #define EIGEN_COREITERATORS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core diff --git a/libs/eigen/Eigen/src/Core/CwiseBinaryOp.h b/libs/eigen/Eigen/src/Core/CwiseBinaryOp.h index 2202b1c..21a061a 100644 --- a/libs/eigen/Eigen/src/Core/CwiseBinaryOp.h +++ b/libs/eigen/Eigen/src/Core/CwiseBinaryOp.h @@ -11,6 +11,8 @@ #ifndef EIGEN_CWISE_BINARY_OP_H #define EIGEN_CWISE_BINARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -19,7 +21,7 @@ struct traits > { // we must 
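
In the EvalTo evaluator, re-initializing the base with placement new (::new (static_cast<void*>(this)) Base(...)) becomes internal::construct_at(this, ...), Eigen's portable counterpart of C++20 std::construct_at, which is also usable in constant evaluation. The general pattern, shown with the standard facility (assumes a C++20 toolchain):

    #include <memory>
    #include <string>

    int main() {
      alignas(std::string) unsigned char buf[sizeof(std::string)];
      // construct_at: the constexpr-friendly replacement for placement new.
      std::string* s = std::construct_at(reinterpret_cast<std::string*>(buf), "hi");
      std::destroy_at(s);  // matching manual destruction
    }
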
not inherit from traits since it has // the potential to cause problems with MSVC - typedef typename remove_all::type Ancestor; + typedef remove_all_t Ancestor; typedef typename traits::XprKind XprKind; enum { RowsAtCompileTime = traits::RowsAtCompileTime, @@ -43,10 +45,10 @@ struct traits > typename traits::StorageIndex>::type StorageIndex; typedef typename Lhs::Nested LhsNested; typedef typename Rhs::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; enum { - Flags = cwise_promote_storage_order::StorageKind,typename traits::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value + Flags = cwise_promote_storage_order::StorageKind,typename traits::StorageKind,LhsNested_::Flags & RowMajorBit,RhsNested_::Flags & RowMajorBit>::value }; }; } // end namespace internal @@ -84,9 +86,9 @@ class CwiseBinaryOp : { public: - typedef typename internal::remove_all::type Functor; - typedef typename internal::remove_all::type Lhs; - typedef typename internal::remove_all::type Rhs; + typedef internal::remove_all_t Functor; + typedef internal::remove_all_t Lhs; + typedef internal::remove_all_t Rhs; typedef typename CwiseBinaryOpImpl< BinaryOp, LhsType, RhsType, @@ -95,12 +97,15 @@ class CwiseBinaryOp : BinaryOp>::ret>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp) + EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs) + typedef typename internal::ref_selector::type LhsNested; typedef typename internal::ref_selector::type RhsNested; - typedef typename internal::remove_reference::type _LhsNested; - typedef typename internal::remove_reference::type _RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; -#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 +#if EIGEN_COMP_MSVC //Required for Visual Studio or the Copy constructor will probably not get inlined! EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp&) = default; @@ -110,29 +115,26 @@ class CwiseBinaryOp : CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { - EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); - // require the sizes to match - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs) eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - return internal::traits::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows(); + return internal::traits>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - return internal::traits::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols(); + return internal::traits>::ColsAtCompileTime==Dynamic ? 
m_rhs.cols() : m_lhs.cols(); } /** \returns the left hand side nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const _LhsNested& lhs() const { return m_lhs; } + const LhsNested_& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const _RhsNested& rhs() const { return m_rhs; } + const RhsNested_& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; } diff --git a/libs/eigen/Eigen/src/Core/CwiseNullaryOp.h b/libs/eigen/Eigen/src/Core/CwiseNullaryOp.h index 289ec51..b33c052 100644 --- a/libs/eigen/Eigen/src/Core/CwiseNullaryOp.h +++ b/libs/eigen/Eigen/src/Core/CwiseNullaryOp.h @@ -10,6 +10,8 @@ #ifndef EIGEN_CWISE_NULLARY_OP_H #define EIGEN_CWISE_NULLARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -304,6 +306,20 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessEqualSpacedReturnType +DenseBase::EqualSpaced(Index size, const Scalar& low, const Scalar& step) { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return DenseBase::NullaryExpr(size, internal::equalspaced_op(low, step)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessEqualSpacedReturnType +DenseBase::EqualSpaced(const Scalar& low, const Scalar& step) { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::equalspaced_op(low, step)); +} + /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant @@ -453,6 +469,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced( return setLinSpaced(size(), low, high); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setEqualSpaced(Index newSize, const Scalar& low, + const Scalar& step) { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return derived() = Derived::NullaryExpr(newSize, internal::equalspaced_op(low, step)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setEqualSpaced(const Scalar& low, + const Scalar& step) { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return setEqualSpaced(size(), low, step); +} + // zero: /** \returns an expression of a zero matrix. 
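
EqualSpaced / setEqualSpaced are new: where LinSpaced derives the increment from the endpoints, EqualSpaced takes the start value and the step directly. Usage per the declarations in the hunk above:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // 0, 3, 6, 9, 12: five coefficients starting at 0 with step 3.
      Eigen::VectorXd v = Eigen::VectorXd::EqualSpaced(5, 0.0, 3.0);
      Eigen::VectorXd w(5);
      w.setEqualSpaced(0.0, 3.0);  // in-place variant, size taken from w
      std::cout << v.transpose() << "\n" << w.transpose() << "\n";
    }
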
diff --git a/libs/eigen/Eigen/src/Core/CwiseTernaryOp.h b/libs/eigen/Eigen/src/Core/CwiseTernaryOp.h index 9f3576f..8d24a48 100644 --- a/libs/eigen/Eigen/src/Core/CwiseTernaryOp.h +++ b/libs/eigen/Eigen/src/Core/CwiseTernaryOp.h @@ -12,6 +12,8 @@ #ifndef EIGEN_CWISE_TERNARY_OP_H #define EIGEN_CWISE_TERNARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -19,7 +21,7 @@ template struct traits > { // we must not inherit from traits since it has // the potential to cause problems with MSVC - typedef typename remove_all::type Ancestor; + typedef remove_all_t Ancestor; typedef typename traits::XprKind XprKind; enum { RowsAtCompileTime = traits::RowsAtCompileTime, @@ -41,10 +43,10 @@ struct traits > { typedef typename Arg1::Nested Arg1Nested; typedef typename Arg2::Nested Arg2Nested; typedef typename Arg3::Nested Arg3Nested; - typedef typename remove_reference::type _Arg1Nested; - typedef typename remove_reference::type _Arg2Nested; - typedef typename remove_reference::type _Arg3Nested; - enum { Flags = _Arg1Nested::Flags & RowMajorBit }; + typedef std::remove_reference_t Arg1Nested_; + typedef std::remove_reference_t Arg2Nested_; + typedef std::remove_reference_t Arg3Nested_; + enum { Flags = Arg1Nested_::Flags & RowMajorBit }; }; } // end namespace internal @@ -87,9 +89,23 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl< internal::no_assignment_operator { public: - typedef typename internal::remove_all::type Arg1; - typedef typename internal::remove_all::type Arg2; - typedef typename internal::remove_all::type Arg3; + typedef internal::remove_all_t Arg1; + typedef internal::remove_all_t Arg2; + typedef internal::remove_all_t Arg3; + + // require the sizes to match + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3) + + // The index types should match + EIGEN_STATIC_ASSERT((internal::is_same< + typename internal::traits::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same< + typename internal::traits::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) typedef typename CwiseTernaryOpImpl< TernaryOp, Arg1Type, Arg2Type, Arg3Type, @@ -99,29 +115,15 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl< typedef typename internal::ref_selector::type Arg1Nested; typedef typename internal::ref_selector::type Arg2Nested; typedef typename internal::ref_selector::type Arg3Nested; - typedef typename internal::remove_reference::type _Arg1Nested; - typedef typename internal::remove_reference::type _Arg2Nested; - typedef typename internal::remove_reference::type _Arg3Nested; + typedef std::remove_reference_t Arg1Nested_; + typedef std::remove_reference_t Arg2Nested_; + typedef std::remove_reference_t Arg3Nested_; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2, const Arg3& a3, const TernaryOp& func = TernaryOp()) : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) { - // require the sizes to match - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2) - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3) - - // The index types should match - EIGEN_STATIC_ASSERT((internal::is_same< - typename internal::traits::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same< - typename internal::traits::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - 
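
Note where the size and storage-kind static asserts move in this file: out of the CwiseTernaryOp constructor body and up to class scope (CwiseBinaryOp got the same treatment earlier in this patch). At class scope they fire on any instantiation of the type, not only when the constructor happens to be compiled. The difference in miniature:

    #include <type_traits>

    template <class A, class B>
    struct checked_in_ctor {
      checked_in_ctor() {  // fires only if this constructor is instantiated
        static_assert(std::is_same<A, B>::value, "mismatched operands");
      }
    };

    template <class A, class B>
    struct checked_at_class_scope {
      // fires whenever the class template itself is instantiated
      static_assert(std::is_same<A, B>::value, "mismatched operands");
    };

    int main() { checked_at_class_scope<int, int> ok; (void)ok; }
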
eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() && a1.rows() == a3.rows() && a1.cols() == a3.cols()); } @@ -130,14 +132,14 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl< EIGEN_STRONG_INLINE Index rows() const { // return the fixed size type if available to enable compile time // optimizations - if (internal::traits::type>:: + if (internal::traits>:: RowsAtCompileTime == Dynamic && - internal::traits::type>:: + internal::traits>:: RowsAtCompileTime == Dynamic) return m_arg3.rows(); - else if (internal::traits::type>:: + else if (internal::traits>:: RowsAtCompileTime == Dynamic && - internal::traits::type>:: + internal::traits>:: RowsAtCompileTime == Dynamic) return m_arg2.rows(); else @@ -147,14 +149,14 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl< EIGEN_STRONG_INLINE Index cols() const { // return the fixed size type if available to enable compile time // optimizations - if (internal::traits::type>:: + if (internal::traits>:: ColsAtCompileTime == Dynamic && - internal::traits::type>:: + internal::traits>:: ColsAtCompileTime == Dynamic) return m_arg3.cols(); - else if (internal::traits::type>:: + else if (internal::traits>:: ColsAtCompileTime == Dynamic && - internal::traits::type>:: + internal::traits>:: ColsAtCompileTime == Dynamic) return m_arg2.cols(); else @@ -163,13 +165,13 @@ class CwiseTernaryOp : public CwiseTernaryOpImpl< /** \returns the first argument nested expression */ EIGEN_DEVICE_FUNC - const _Arg1Nested& arg1() const { return m_arg1; } + const Arg1Nested_& arg1() const { return m_arg1; } /** \returns the first argument nested expression */ EIGEN_DEVICE_FUNC - const _Arg2Nested& arg2() const { return m_arg2; } + const Arg2Nested_& arg2() const { return m_arg2; } /** \returns the third argument nested expression */ EIGEN_DEVICE_FUNC - const _Arg3Nested& arg3() const { return m_arg3; } + const Arg3Nested_& arg3() const { return m_arg3; } /** \returns the functor representing the ternary operation */ EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; } diff --git a/libs/eigen/Eigen/src/Core/CwiseUnaryOp.h b/libs/eigen/Eigen/src/Core/CwiseUnaryOp.h index e68c4f7..ff7d0b9 100644 --- a/libs/eigen/Eigen/src/Core/CwiseUnaryOp.h +++ b/libs/eigen/Eigen/src/Core/CwiseUnaryOp.h @@ -11,6 +11,8 @@ #ifndef EIGEN_CWISE_UNARY_OP_H #define EIGEN_CWISE_UNARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -22,9 +24,9 @@ struct traits > UnaryOp(const typename XprType::Scalar&) >::type Scalar; typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; enum { - Flags = _XprTypeNested::Flags & RowMajorBit + Flags = XprTypeNested_::Flags & RowMajorBit }; }; } @@ -59,7 +61,7 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp) typedef typename internal::ref_selector::type XprTypeNested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) @@ -76,12 +78,12 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::type& + const internal::remove_all_t& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::remove_all::type& + internal::remove_all_t& nestedExpression() { return m_xpr; } protected: diff --git 
a/libs/eigen/Eigen/src/Core/CwiseUnaryView.h b/libs/eigen/Eigen/src/Core/CwiseUnaryView.h index a06d762..b4539a6 100644 --- a/libs/eigen/Eigen/src/Core/CwiseUnaryView.h +++ b/libs/eigen/Eigen/src/Core/CwiseUnaryView.h @@ -10,35 +10,42 @@ #ifndef EIGEN_CWISE_UNARY_VIEW_H #define EIGEN_CWISE_UNARY_VIEW_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -struct traits > +template +struct traits > : traits { typedef typename result_of< ViewOp(const typename traits::Scalar&) >::type Scalar; typedef typename MatrixType::Nested MatrixTypeNested; - typedef typename remove_all::type _MatrixTypeNested; + typedef remove_all_t MatrixTypeNested_; enum { FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, - Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions + Flags = traits::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions MatrixTypeInnerStride = inner_stride_at_compile_time::ret, // need to cast the sizeof's from size_t to int explicitly, otherwise: // "error: no integral type can represent all of the enumerator values - InnerStrideAtCompileTime = MatrixTypeInnerStride == Dynamic - ? int(Dynamic) - : int(MatrixTypeInnerStride) * int(sizeof(typename traits::Scalar) / sizeof(Scalar)), - OuterStrideAtCompileTime = outer_stride_at_compile_time::ret == Dynamic - ? int(Dynamic) - : outer_stride_at_compile_time::ret * int(sizeof(typename traits::Scalar) / sizeof(Scalar)) + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? (MatrixTypeInnerStride == Dynamic + ? int(Dynamic) + : int(MatrixTypeInnerStride) * int(sizeof(typename traits::Scalar) / sizeof(Scalar))) + : int(StrideType::InnerStrideAtCompileTime), + + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? (outer_stride_at_compile_time::ret == Dynamic + ? 
int(Dynamic) + : outer_stride_at_compile_time::ret * int(sizeof(typename traits::Scalar) / sizeof(Scalar))) + : int(StrideType::OuterStrideAtCompileTime) }; }; } -template +template class CwiseUnaryViewImpl; /** \class CwiseUnaryView @@ -54,15 +61,15 @@ class CwiseUnaryViewImpl; * * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp */ -template -class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind> +template +class CwiseUnaryView : public CwiseUnaryViewImpl::StorageKind> { public: - typedef typename CwiseUnaryViewImpl::StorageKind>::Base Base; + typedef typename CwiseUnaryViewImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView) typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) : m_matrix(mat), m_functor(func) {} @@ -78,11 +85,11 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::type& + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC typename internal::remove_reference::type& + EIGEN_DEVICE_FUNC std::remove_reference_t& nestedExpression() { return m_matrix; } protected: @@ -91,22 +98,22 @@ class CwiseUnaryView : public CwiseUnaryViewImpl +template class CwiseUnaryViewImpl - : public internal::generic_xpr_base >::type + : public internal::generic_xpr_base >::type { public: - typedef typename internal::generic_xpr_base >::type Base; + typedef typename internal::generic_xpr_base >::type Base; }; -template -class CwiseUnaryViewImpl - : public internal::dense_xpr_base< CwiseUnaryView >::type +template +class CwiseUnaryViewImpl + : public internal::dense_xpr_base< CwiseUnaryView >::type { public: - typedef CwiseUnaryView Derived; - typedef typename internal::dense_xpr_base< CwiseUnaryView >::type Base; + typedef CwiseUnaryView Derived; + typedef typename internal::dense_xpr_base< CwiseUnaryView >::type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl) @@ -116,12 +123,16 @@ class CwiseUnaryViewImpl EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { - return derived().nestedExpression().innerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); + return StrideType::InnerStrideAtCompileTime != 0 + ? int(StrideType::InnerStrideAtCompileTime) + : derived().nestedExpression().innerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); + return StrideType::OuterStrideAtCompileTime != 0 + ? 
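
CwiseUnaryView gains a StrideType parameter: a non-zero compile-time stride overrides the default computation above, which scales the nested expression's strides by the sizeof ratio of the two scalar types. That default is what makes views like .real() on a complex matrix work; a quick check of it:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(3, 3);
      auto r = m.real();  // CwiseUnaryView over the real parts
      // Each complex<double> holds two doubles, hence inner stride 2.
      std::cout << r.innerStride() << "\n";  // 2
      r(0, 0) = 42.0;                        // the view is writable
      std::cout << m(0, 0) << "\n";          // real part now 42
    }
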
int(StrideType::OuterStrideAtCompileTime) + : derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } protected: EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl) diff --git a/libs/eigen/Eigen/src/Core/DenseBase.h b/libs/eigen/Eigen/src/Core/DenseBase.h index 9b16db6..bcfd0f6 100644 --- a/libs/eigen/Eigen/src/Core/DenseBase.h +++ b/libs/eigen/Eigen/src/Core/DenseBase.h @@ -11,17 +11,12 @@ #ifndef EIGEN_DENSEBASE_H #define EIGEN_DENSEBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { -namespace internal { - // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type. -// This dummy function simply aims at checking that at compile time. -static inline void check_DenseIndex_is_signed() { - EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) -} - -} // end namespace internal +EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) /** \class DenseBase * \ingroup Core_Module @@ -110,8 +105,7 @@ template class DenseBase * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */ - SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, - internal::traits::ColsAtCompileTime>::ret), + SizeAtCompileTime = (internal::size_of_xpr_at_compile_time::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */ @@ -138,8 +132,8 @@ template class DenseBase * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime */ - MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, - internal::traits::MaxColsAtCompileTime>::ret), + MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime), /**< This value is equal to the maximum possible number of coefficients that this expression * might have. If this expression might have an arbitrarily high number of coefficients, * this value is set to \a Dynamic. @@ -206,13 +200,8 @@ template class DenseBase * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed * that the return type of eval() is either PlainObject or const PlainObject&. */ - typedef typename internal::conditional::XprKind,MatrixXpr >::value, - PlainMatrix, PlainArray>::type PlainObject; - - /** \returns the number of nonzero coefficients which is in practice the number - * of stored coefficients. */ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - inline Index nonZeros() const { return size(); } + typedef std::conditional_t::XprKind,MatrixXpr >::value, + PlainMatrix, PlainArray> PlainObject; /** \returns the outer size. * @@ -269,6 +258,8 @@ template class DenseBase EIGEN_DEPRECATED typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + /** \internal Represents a vector with equally spaced coefficients that allows random access. 
*/ + typedef CwiseNullaryOp, PlainObject> RandomAccessEqualSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -324,9 +315,9 @@ template class DenseBase typedef Transpose TransposeReturnType; EIGEN_DEVICE_FUNC TransposeReturnType transpose(); - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; EIGEN_DEVICE_FUNC - ConstTransposeReturnType transpose() const; + const ConstTransposeReturnType transpose() const; EIGEN_DEVICE_FUNC void transposeInPlace(); @@ -347,6 +338,11 @@ template class DenseBase EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType + EqualSpaced(Index size, const Scalar& low, const Scalar& step); + EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType + EqualSpaced(const Scalar& low, const Scalar& step); + template EIGEN_DEVICE_FUNC static const CwiseNullaryOp NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func); @@ -368,6 +364,8 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value); EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high); EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC Derived& setEqualSpaced(Index size, const Scalar& low, const Scalar& step); + EIGEN_DEVICE_FUNC Derived& setEqualSpaced(const Scalar& low, const Scalar& step); EIGEN_DEVICE_FUNC Derived& setZero(); EIGEN_DEVICE_FUNC Derived& setOnes(); EIGEN_DEVICE_FUNC Derived& setRandom(); @@ -387,15 +385,15 @@ template class DenseBase EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; - inline bool hasNaN() const; - inline bool allFinite() const; + EIGEN_DEVICE_FUNC inline bool hasNaN() const; + EIGEN_DEVICE_FUNC inline bool allFinite() const; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other); - typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; + typedef internal::add_const_on_value_type_t::type> EvalReturnType; /** \returns the matrix or vector obtained by evaluating this expression. 
* * Notice that in the case of a plain matrix or vector (not an expression) this function just returns @@ -439,9 +437,9 @@ template class DenseBase EIGEN_DEVICE_FUNC inline const ForceAlignedAccess forceAlignedAccess() const; EIGEN_DEVICE_FUNC inline ForceAlignedAccess forceAlignedAccess(); template EIGEN_DEVICE_FUNC - inline const typename internal::conditional,Derived&>::type forceAlignedAccessIf() const; + inline const std::conditional_t,Derived&> forceAlignedAccessIf() const; template EIGEN_DEVICE_FUNC - inline typename internal::conditional,Derived&>::type forceAlignedAccessIf(); + inline std::conditional_t,Derived&> forceAlignedAccessIf(); EIGEN_DEVICE_FUNC Scalar sum() const; EIGEN_DEVICE_FUNC Scalar mean() const; @@ -621,27 +619,21 @@ template class DenseBase /** This is the const version of iterator (aka read-only) */ typedef random_access_iterator_type const_iterator; #else - typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, - internal::pointer_based_stl_iterator, - internal::generic_randaccess_stl_iterator - >::type iterator_type; + typedef std::conditional_t< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + > iterator_type; - typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, - internal::pointer_based_stl_iterator, - internal::generic_randaccess_stl_iterator - >::type const_iterator_type; + typedef std::conditional_t< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + > const_iterator_type; // Stl-style iterators are supported only for vectors. - typedef typename internal::conditional< IsVectorAtCompileTime, - iterator_type, - void - >::type iterator; + typedef std::conditional_t iterator; - typedef typename internal::conditional< IsVectorAtCompileTime, - const_iterator_type, - void - >::type const_iterator; + typedef std::conditional_t const_iterator; #endif inline iterator begin(); @@ -678,14 +670,13 @@ template class DenseBase protected: EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase) /** Default constructor. Do nothing. */ - EIGEN_DEVICE_FUNC DenseBase() - { + EIGEN_DEVICE_FUNC constexpr DenseBase() { /* Just checks for self-consistency of the flags. 
* Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down */ #ifdef EIGEN_INTERNAL_DEBUGGING - EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) - && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))), + EIGEN_STATIC_ASSERT((internal::check_implication(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) + && internal::check_implication(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))), INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION) #endif } diff --git a/libs/eigen/Eigen/src/Core/DenseCoeffsBase.h b/libs/eigen/Eigen/src/Core/DenseCoeffsBase.h index 37fcdb5..7f0bcf4 100644 --- a/libs/eigen/Eigen/src/Core/DenseCoeffsBase.h +++ b/libs/eigen/Eigen/src/Core/DenseCoeffsBase.h @@ -10,12 +10,14 @@ #ifndef EIGEN_DENSECOEFFSBASE_H #define EIGEN_DENSECOEFFSBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { template struct add_const_on_value_type_if_arithmetic { - typedef typename conditional::value, T, typename add_const_on_value_type::type>::type type; + typedef std::conditional_t::value, T, add_const_on_value_type_t> type; }; } @@ -43,13 +45,13 @@ class DenseCoeffsBase : public EigenBase // - This is the return type of the coeff() method. // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value). - // - The is_artihmetic check is required since "const int", "const double", etc. will cause warnings on some systems + // - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is // not possible, since the underlying expressions might not offer a valid address the reference could be referring to. 
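// ---------------------------------------------------------------------------
// Illustration (not part of the diff): the hunks above replace Eigen's
// internal::conditional<C,A,B>::type with the C++14 alias std::conditional_t,
// and the comment above explains how CoeffReturnType is chosen. A minimal,
// self-contained sketch of that selection logic, using the hypothetical alias
// name CoeffReturnLike purely for illustration:
#include <type_traits>

template <bool IsLvalue, typename Scalar>
using CoeffReturnLike =
    std::conditional_t<IsLvalue,
                       const Scalar&,  // lvalue expressions can hand out a const reference
                       std::conditional_t<std::is_arithmetic<Scalar>::value,
                                          Scalar,          // cheap arithmetic types go by value
                                          const Scalar>>;  // other types keep a top-level const

static_assert(std::is_same<CoeffReturnLike<true, float>, const float&>::value, "");
static_assert(std::is_same<CoeffReturnLike<false, float>, float>::value, "");
// ---------------------------------------------------------------------------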
- typedef typename internal::conditional::Flags&LvalueBit), - const Scalar&, - typename internal::conditional::value, Scalar, const Scalar>::type - >::type CoeffReturnType; + typedef std::conditional_t::Flags&LvalueBit), + const Scalar&, + std::conditional_t::value, Scalar, const Scalar> + > CoeffReturnType; typedef typename internal::add_const_on_value_type_if_arithmetic< typename internal::packet_traits::type diff --git a/libs/eigen/Eigen/src/Core/DenseStorage.h b/libs/eigen/Eigen/src/Core/DenseStorage.h index 08ef6c5..cf588bd 100644 --- a/libs/eigen/Eigen/src/Core/DenseStorage.h +++ b/libs/eigen/Eigen/src/Core/DenseStorage.h @@ -18,20 +18,20 @@ #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) #endif +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { struct constructor_without_unaligned_array_assert {}; -template -EIGEN_DEVICE_FUNC -void check_static_allocation_size() -{ - // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit - #if EIGEN_STACK_ALLOCATION_LIMIT +template +EIGEN_DEVICE_FUNC constexpr void check_static_allocation_size() { +// if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit +#if EIGEN_STACK_ALLOCATION_LIMIT EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG); - #endif +#endif } /** \internal @@ -45,35 +45,30 @@ struct plain_array { T array[Size]; - EIGEN_DEVICE_FUNC - plain_array() - { - check_static_allocation_size(); - } + EIGEN_DEVICE_FUNC constexpr plain_array() { check_static_allocation_size(); } - EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { - check_static_allocation_size(); + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) { + check_static_allocation_size(); } }; #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) -#elif EIGEN_GNUC_AT_LEAST(4,7) +#elif EIGEN_COMP_GNUC // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; } #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \ + eigen_assert((internal::is_constant_evaluated() \ + || (internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0) \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #else #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \ + eigen_assert((internal::is_constant_evaluated() || (internal::UIntPtr(array) & (sizemask)) == 0) \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! 
****"); @@ -84,17 +79,13 @@ struct plain_array { EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; - EIGEN_DEVICE_FUNC - plain_array() - { + EIGEN_DEVICE_FUNC constexpr plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); check_static_allocation_size(); } - EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { - check_static_allocation_size(); + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) { + check_static_allocation_size(); } }; @@ -103,17 +94,13 @@ struct plain_array { EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; - EIGEN_DEVICE_FUNC - plain_array() - { + EIGEN_DEVICE_FUNC constexpr plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); check_static_allocation_size(); } - EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { - check_static_allocation_size(); + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) { + check_static_allocation_size(); } }; @@ -122,17 +109,13 @@ struct plain_array { EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; - EIGEN_DEVICE_FUNC - plain_array() - { + EIGEN_DEVICE_FUNC constexpr plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); check_static_allocation_size(); } - EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { - check_static_allocation_size(); + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) { + check_static_allocation_size(); } }; @@ -141,17 +124,13 @@ struct plain_array { EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; - EIGEN_DEVICE_FUNC - plain_array() - { + EIGEN_DEVICE_FUNC constexpr plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } - EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { - check_static_allocation_size(); + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) { + check_static_allocation_size(); } }; @@ -159,8 +138,8 @@ template struct plain_array { T array[1]; - EIGEN_DEVICE_FUNC plain_array() {} - EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} + EIGEN_DEVICE_FUNC constexpr plain_array() {} + EIGEN_DEVICE_FUNC constexpr plain_array(constructor_without_unaligned_array_assert) {} }; struct plain_array_helper { @@ -201,57 +180,32 @@ struct plain_array_helper { * * \sa Matrix */ -template class DenseStorage; +template class DenseStorage; // purely fixed-size matrix -template class DenseStorage +template class DenseStorage { - internal::plain_array m_data; + internal::plain_array m_data; public: - EIGEN_DEVICE_FUNC DenseStorage() { + constexpr EIGEN_DEVICE_FUNC DenseStorage() { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } - EIGEN_DEVICE_FUNC - explicit DenseStorage(internal::constructor_without_unaligned_array_assert) + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} -#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) - EIGEN_DEVICE_FUNC +#if defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } #else - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) = default; #endif -#if !EIGEN_HAS_CXX11 - EIGEN_DEVICE_FUNC - DenseStorage& operator=(const DenseStorage& other) - { - if 
(this != &other) m_data = other.m_data; - return *this; - } -#else - EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; -#endif -#if EIGEN_HAS_RVALUE_REFERENCES -#if !EIGEN_HAS_CXX11 - EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT - : m_data(std::move(other.m_data)) - { - } - EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT - { - if (this != &other) - m_data = std::move(other.m_data); - return *this; - } -#else - EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; - EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default; -#endif -#endif - EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { + EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(const DenseStorage&) = default; + EIGEN_DEVICE_FUNC constexpr DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC constexpr DenseStorage& operator=(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) - eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); + eigen_internal_assert(size == rows * cols && rows == Rows_ && cols == Cols_); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); @@ -259,57 +213,148 @@ template class DenseSt EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_data, other.m_data); } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;} - EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} - EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {} - EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } - EIGEN_DEVICE_FUNC T *data() { return m_data.array; } + EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; } + EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT { return Cols_; } + EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index) {} + EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index) {} + EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; } + EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; } }; // null matrix -template class DenseStorage +template +class DenseStorage { public: - EIGEN_DEVICE_FUNC DenseStorage() {} - EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {} - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {} - EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; } - EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {} - EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {} - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;} - EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} - EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {} - EIGEN_DEVICE_FUNC const T *data() const { return 0; } - EIGEN_DEVICE_FUNC T *data() { return 0; } + static_assert(Rows_ * Cols_ == 0, "The fixed number of rows times columns must equal the storage size."); + EIGEN_DEVICE_FUNC constexpr DenseStorage() {} + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage&) {} + EIGEN_DEVICE_FUNC 
constexpr DenseStorage& operator=(const DenseStorage&) { return *this; } + EIGEN_DEVICE_FUNC constexpr DenseStorage(Index,Index,Index) {} + EIGEN_DEVICE_FUNC constexpr void swap(DenseStorage& ) {} + EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT {return Rows_;} + EIGEN_DEVICE_FUNC static constexpr Index cols(void) EIGEN_NOEXCEPT {return Cols_;} + EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index,Index,Index) {} + EIGEN_DEVICE_FUNC constexpr void resize(Index,Index,Index) {} + EIGEN_DEVICE_FUNC constexpr const T *data() const { return 0; } + EIGEN_DEVICE_FUNC constexpr T *data() { return 0; } }; // more specializations for null matrices; these are necessary to resolve ambiguities -template class DenseStorage -: public DenseStorage { }; - -template class DenseStorage -: public DenseStorage { }; - -template class DenseStorage -: public DenseStorage { }; - -// dynamic-size matrix with fixed-size storage -template class DenseStorage -{ - internal::plain_array m_data; +template +class DenseStorage { Index m_rows; Index m_cols; public: EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {} - EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) - : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {} - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols) - { - internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data); + EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows), m_cols(other.m_cols) {} + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { + m_rows = other.m_rows; + m_cols = other.m_cols; + return *this; } + EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) { + eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {return m_cols;} + EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { + m_rows = rows; + m_cols = cols; + eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { + m_rows = rows; + m_cols = cols; + eigen_assert(m_rows * m_cols == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC const T *data() const { return nullptr; } + EIGEN_DEVICE_FUNC T *data() { return nullptr; } +}; + +template +class DenseStorage { + Index m_cols; + public: + EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {} + EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_cols(other.m_cols) {} + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { + m_cols = other.m_cols; + return *this; + } + EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) { + eigen_assert(Rows_ * m_cols == 0 && "The number of rows 
times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_cols, other.m_cols); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return Rows_;} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} + EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { + m_cols = cols; + eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { + m_cols = cols; + eigen_assert(Rows_ * m_cols == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC const T *data() const { return nullptr; } + EIGEN_DEVICE_FUNC T *data() { return nullptr; } +}; + +template +class DenseStorage { + Index m_rows; + public: + EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {} + EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_rows(other.m_rows) {} + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { + m_rows = other.m_rows; + return *this; + } + EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) { + eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_rows, other.m_rows); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return Cols_;} + EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { + m_rows = rows; + eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { + m_rows = rows; + eigen_assert(m_rows * Cols_ == 0 && "The number of rows times columns must equal the storage size."); + } + EIGEN_DEVICE_FUNC const T *data() const { return nullptr; } + EIGEN_DEVICE_FUNC T *data() { return nullptr; } +}; + +// dynamic-size matrix with fixed-size storage +template +class DenseStorage +{ + internal::plain_array m_data; + Index m_rows; + Index m_cols; + public: + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(), m_rows(0), m_cols(0) {} + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) + : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other) + : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols) { + internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data); + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) @@ -320,113 +365,121 @@ template class DenseStorage class DenseStorage +template +class DenseStorage { - internal::plain_array m_data; + internal::plain_array m_data; Index m_rows; public: - EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {} - EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) - : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {} - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::constructor_without_unaligned_array_assert()), 
m_rows(other.m_rows) - { - internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data); - } - + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_rows(0) {} + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) + : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other) + : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows) { + internal::plain_array_helper::copy(other.m_data, m_rows * Cols_, m_data); + } + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) { m_rows = other.m_rows; - internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data); + internal::plain_array_helper::copy(other.m_data, m_rows * Cols_, m_data); } return *this; } - EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(Index, Index rows, Index) : m_rows(rows) {} EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { - internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols); + internal::plain_array_helper::swap(m_data, m_rows * Cols_, other.m_data, other.m_rows * Cols_); numext::swap(m_rows, other.m_rows); } - EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;} - EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; } - EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; } - EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } - EIGEN_DEVICE_FUNC T *data() { return m_data.array; } + EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return m_rows; } + EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return Cols_; } + EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index rows, Index) { m_rows = rows; } + EIGEN_DEVICE_FUNC constexpr void resize(Index, Index rows, Index) { m_rows = rows; } + EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; } + EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; } }; // dynamic-size matrix with fixed-size storage and fixed height -template class DenseStorage +template +class DenseStorage { - internal::plain_array m_data; + internal::plain_array m_data; Index m_cols; public: - EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {} - EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) - : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {} - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols) - { - internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data); - } + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_cols(0) {} + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) + : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(const DenseStorage& other) + : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols) { + internal::plain_array_helper::copy(other.m_data, Rows_ * m_cols, m_data); + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) { m_cols = 
other.m_cols; - internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data); + internal::plain_array_helper::copy(other.m_data, Rows_ * m_cols, m_data); } return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {} EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { - internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols); + internal::plain_array_helper::swap(m_data, Rows_ * m_cols, other.m_data, Rows_ * other.m_cols); numext::swap(m_cols, other.m_cols); } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;} - EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} - EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; } - EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; } - EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } - EIGEN_DEVICE_FUNC T *data() { return m_data.array; } + EIGEN_DEVICE_FUNC constexpr Index rows(void) const EIGEN_NOEXCEPT { return Rows_; } + EIGEN_DEVICE_FUNC constexpr Index cols(void) const EIGEN_NOEXCEPT { return m_cols; } + EIGEN_DEVICE_FUNC constexpr void conservativeResize(Index, Index, Index cols) { m_cols = cols; } + EIGEN_DEVICE_FUNC constexpr void resize(Index, Index, Index cols) { m_cols = cols; } + EIGEN_DEVICE_FUNC constexpr const T* data() const { return m_data.array; } + EIGEN_DEVICE_FUNC constexpr T* data() { return m_data.array; } }; // purely dynamic matrix. -template class DenseStorage +template +class DenseStorage { T *m_data; Index m_rows; Index m_cols; public: - EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {} - EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0), m_cols(0) {} + EIGEN_DEVICE_FUNC explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0), m_cols(0) {} - EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) - : m_data(internal::conditional_aligned_new_auto(size)), m_rows(rows), m_cols(cols) - { + EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) + : m_data(internal::conditional_aligned_new_auto(size)), + m_rows(rows), + m_cols(cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); - } + } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::conditional_aligned_new_auto(other.m_rows*other.m_cols)) + : m_data(internal::conditional_aligned_new_auto(other.m_rows*other.m_cols)) , m_rows(other.m_rows) , m_cols(other.m_cols) { @@ -442,7 +495,6 @@ template class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_data,other.m_data); @@ -473,7 +524,7 @@ template class DenseStorage(m_data, size, m_rows*m_cols); + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*m_cols); m_rows = rows; m_cols = cols; } @@ -481,9 +532,9 @@ template class DenseStorage(m_data, m_rows*m_cols); + internal::conditional_aligned_delete_auto(m_data, m_rows*m_cols); if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative - m_data = internal::conditional_aligned_new_auto(size); + m_data = internal::conditional_aligned_new_auto(size); 
else m_data = 0; EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) @@ -496,25 +547,25 @@ template class DenseStorage class DenseStorage -{ +template +class DenseStorage { T *m_data; Index m_cols; public: - EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {} - explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {} - EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto(size)), m_cols(cols) - { + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_cols(0) {} + explicit constexpr DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {} + EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) + : m_data(internal::conditional_aligned_new_auto(size)), m_cols(cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) - eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); + eigen_internal_assert(size==rows*cols && rows==Rows_ && cols >=0); EIGEN_UNUSED_VARIABLE(rows); - } + } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::conditional_aligned_new_auto(_Rows*other.m_cols)) + : m_data(internal::conditional_aligned_new_auto(Rows_*other.m_cols)) , m_cols(other.m_cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) - internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*Rows_) + internal::smart_copy(other.m_data, other.m_data+Rows_*m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -525,7 +576,6 @@ template class DenseStorage class DenseStorage(m_data, _Rows*m_cols); } + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, Rows_*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_data,other.m_data); numext::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC static constexpr Index rows(void) EIGEN_NOEXCEPT { return Rows_; } EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) { - m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, _Rows*m_cols); + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, Rows_*m_cols); m_cols = cols; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols) { - if(size != _Rows*m_cols) + if(size != Rows_*m_cols) { - internal::conditional_aligned_delete_auto(m_data, _Rows*m_cols); + internal::conditional_aligned_delete_auto(m_data, Rows_*m_cols); if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative - m_data = internal::conditional_aligned_new_auto(size); + m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) @@ -572,25 +621,26 @@ template class DenseStorage class DenseStorage +template +class DenseStorage { T *m_data; Index m_rows; public: - EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {} - explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {} - EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto(size)), m_rows(rows) - { + EIGEN_DEVICE_FUNC constexpr DenseStorage() : m_data(0), m_rows(0) {} + explicit constexpr 
DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {} + EIGEN_DEVICE_FUNC constexpr DenseStorage(Index size, Index rows, Index cols) + : m_data(internal::conditional_aligned_new_auto(size)), m_rows(rows) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) - eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); + eigen_internal_assert(size==rows*cols && rows>=0 && cols == Cols_); EIGEN_UNUSED_VARIABLE(cols); - } + } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) - : m_data(internal::conditional_aligned_new_auto(other.m_rows*_Cols)) + : m_data(internal::conditional_aligned_new_auto(other.m_rows*Cols_)) , m_rows(other.m_rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) - internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*Cols_) + internal::smart_copy(other.m_data, other.m_data+other.m_rows*Cols_, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -601,7 +651,6 @@ template class DenseStorage class DenseStorage(m_data, _Cols*m_rows); } + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, Cols_*m_rows); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { numext::swap(m_data,other.m_data); numext::swap(m_rows,other.m_rows); } EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC static constexpr Index cols(void) { return Cols_; } void conservativeResize(Index size, Index rows, Index) { - m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*_Cols); + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*Cols_); m_rows = rows; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index) { - if(size != m_rows*_Cols) + if(size != m_rows*Cols_) { - internal::conditional_aligned_delete_auto(m_data, _Cols*m_rows); + internal::conditional_aligned_delete_auto(m_data, Cols_*m_rows); if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative - m_data = internal::conditional_aligned_new_auto(size); + m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) diff --git a/libs/eigen/Eigen/src/Core/Diagonal.h b/libs/eigen/Eigen/src/Core/Diagonal.h index 3112d2c..4af17dd 100644 --- a/libs/eigen/Eigen/src/Core/Diagonal.h +++ b/libs/eigen/Eigen/src/Core/Diagonal.h @@ -11,6 +11,8 @@ #ifndef EIGEN_DIAGONAL_H #define EIGEN_DIAGONAL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class Diagonal @@ -18,8 +20,8 @@ namespace Eigen { * * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix * - * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal - * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal. + * \tparam MatrixType the type of the object in which we are taking a sub/main/super diagonal + * \tparam DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal. * A positive value means a superdiagonal, a negative value means a subdiagonal. * You can also use DynamicIndex so the index can be set at runtime. 
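// ---------------------------------------------------------------------------
// A minimal usage sketch (not part of the diff) of the accessors documented
// above: a compile-time template index selects a fixed sub/superdiagonal,
// while diagonal(Index) takes the offset at runtime, which is what
// DynamicIndex enables internally.
#include <Eigen/Dense>

inline void diagonal_views(Eigen::Matrix4d& m) {
  auto d0  = m.diagonal();     // main diagonal
  auto d1  = m.diagonal<1>();  // first superdiagonal, index fixed at compile time
  auto dm1 = m.diagonal(-1);   // first subdiagonal, index chosen at runtime
  d1.setZero();                // Diagonal is a writable view into m
  (void)d0; (void)dm1;
}
// ---------------------------------------------------------------------------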
* @@ -38,21 +40,21 @@ struct traits > : traits { typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type _MatrixTypeNested; + typedef std::remove_reference_t MatrixTypeNested_; typedef typename MatrixType::StorageKind StorageKind; enum { RowsAtCompileTime = (int(DiagIndex) == DynamicIndex || int(MatrixType::SizeAtCompileTime) == Dynamic) ? Dynamic - : (EIGEN_PLAIN_ENUM_MIN(MatrixType::RowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0), - MatrixType::ColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))), + : (plain_enum_min(MatrixType::RowsAtCompileTime - plain_enum_max(-DiagIndex, 0), + MatrixType::ColsAtCompileTime - plain_enum_max( DiagIndex, 0))), ColsAtCompileTime = 1, MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic - : DiagIndex == DynamicIndex ? EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::MaxRowsAtCompileTime, - MatrixType::MaxColsAtCompileTime) - : (EIGEN_PLAIN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0), - MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))), + : DiagIndex == DynamicIndex ? min_size_prefer_fixed(MatrixType::MaxRowsAtCompileTime, + MatrixType::MaxColsAtCompileTime) + : (plain_enum_min(MatrixType::MaxRowsAtCompileTime - plain_enum_max(-DiagIndex, 0), + MatrixType::MaxColsAtCompileTime - plain_enum_max( DiagIndex, 0))), MaxColsAtCompileTime = 1, MaskLvalueBit = is_lvalue::value ? LvalueBit : 0, - Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions + Flags = (unsigned int)MatrixTypeNested_::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions MatrixTypeOuterStride = outer_stride_at_compile_time::ret, InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1, OuterStrideAtCompileTime = 0 @@ -60,12 +62,12 @@ struct traits > }; } -template class Diagonal - : public internal::dense_xpr_base< Diagonal >::type +template class Diagonal + : public internal::dense_xpr_base< Diagonal >::type { public: - enum { DiagIndex = _DiagIndex }; + enum { DiagIndex = DiagIndex_ }; typedef typename internal::dense_xpr_base::type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) @@ -95,11 +97,11 @@ template class Diagonal EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const EIGEN_NOEXCEPT { return 0; } - typedef typename internal::conditional< - internal::is_lvalue::value, - Scalar, - const Scalar - >::type ScalarWithConstIfNotLvalue; + typedef std::conditional_t< + internal::is_lvalue::value, + Scalar, + const Scalar + > ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } @@ -145,7 +147,7 @@ template class Diagonal } EIGEN_DEVICE_FUNC - inline const typename internal::remove_all::type& + inline const internal::remove_all_t& nestedExpression() const { return m_matrix; @@ -191,7 +193,8 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline +const typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -209,18 +212,18 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline Diagonal MatrixBase::diagonal(Index index) { - return DiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const Diagonal MatrixBase::diagonal(Index index) const { - return ConstDiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this @@ -237,20 +240,20 @@ MatrixBase::diagonal(Index index) const template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template DiagonalIndexReturnType::Type +inline Diagonal MatrixBase::diagonal() { - return typename DiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } /** This is the const version of diagonal(). */ template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type +inline const Diagonal MatrixBase::diagonal() const { - return typename ConstDiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/DiagonalMatrix.h b/libs/eigen/Eigen/src/Core/DiagonalMatrix.h index 542685c..405cc71 100644 --- a/libs/eigen/Eigen/src/Core/DiagonalMatrix.h +++ b/libs/eigen/Eigen/src/Core/DiagonalMatrix.h @@ -11,9 +11,23 @@ #ifndef EIGEN_DIAGONALMATRIX_H #define EIGEN_DIAGONALMATRIX_H -namespace Eigen { +#include "./InternalHeaderCheck.h" -#ifndef EIGEN_PARSED_BY_DOXYGEN +namespace Eigen { + +/** \class DiagonalBase + * \ingroup Core_Module + * + * \brief Base class for diagonal matrices and expressions + * + * This is the base class that is inherited by diagonal matrix and related expression + * types, which internally use a vector for storing the diagonal entries. Diagonal + * types always represent square matrices. + * + * \tparam Derived is the derived type, a DiagonalMatrix or DiagonalWrapper. + * + * \sa class DiagonalMatrix, class DiagonalWrapper + */ template class DiagonalBase : public EigenBase { @@ -37,24 +51,35 @@ class DiagonalBase : public EigenBase typedef DenseMatrixType DenseType; typedef DiagonalMatrix PlainObject; + /** \returns a const reference to the derived object. */ EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast(this); } + /** \returns a reference to the derived object. */ EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast(this); } + /** + * Constructs a dense matrix from \c *this. Note that this directly returns a dense matrix type, + * not an expression. + * \returns A dense matrix, with its diagonal entries set from the derived object. */ EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } + /** \returns a const reference to the derived object's vector of diagonal coefficients.
*/ EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } + /** \returns a reference to the derived object's vector of diagonal coefficients. */ EIGEN_DEVICE_FUNC inline DiagonalVectorType& diagonal() { return derived().diagonal(); } - EIGEN_DEVICE_FUNC + /** \returns the number of rows. */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return diagonal().size(); } - EIGEN_DEVICE_FUNC + /** \returns the number of columns. */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return diagonal().size(); } + /** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */ template EIGEN_DEVICE_FUNC const Product @@ -63,88 +88,99 @@ class DiagonalBase : public EigenBase return Product(derived(),matrix.derived()); } - typedef DiagonalWrapper, const DiagonalVectorType> > InverseReturnType; - EIGEN_DEVICE_FUNC - inline const InverseReturnType - inverse() const - { - return InverseReturnType(diagonal().cwiseInverse()); - } - - EIGEN_DEVICE_FUNC - inline const DiagonalWrapper - operator*(const Scalar& scalar) const - { - return DiagonalWrapper(diagonal() * scalar); - } - EIGEN_DEVICE_FUNC - friend inline const DiagonalWrapper - operator*(const Scalar& scalar, const DiagonalBase& other) - { - return DiagonalWrapper(scalar * other.diagonal()); + template + using DiagonalProductReturnType = DiagonalWrapper; + + /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a other */ + template + EIGEN_DEVICE_FUNC const DiagonalProductReturnType operator*( + const DiagonalBase& other) const { + return diagonal().cwiseProduct(other.diagonal()).asDiagonal(); } - template + using DiagonalInverseReturnType = + DiagonalWrapper, const DiagonalVectorType>>; + + /** \returns the inverse of \c *this. Computed as the coefficient-wise inverse of the diagonal.
*/ EIGEN_DEVICE_FUNC - #ifdef EIGEN_PARSED_BY_DOXYGEN - inline unspecified_expression_type - #else - inline const DiagonalWrapper - #endif - operator+(const DiagonalBase& other) const - { + inline const DiagonalInverseReturnType inverse() const { return diagonal().cwiseInverse().asDiagonal(); } + + using DiagonalScaleReturnType = + DiagonalWrapper; + + /** \returns the product of \c *this by the scalar \a scalar */ + EIGEN_DEVICE_FUNC + inline const DiagonalScaleReturnType operator*(const Scalar& scalar) const { + return (diagonal() * scalar).asDiagonal(); + } + + using ScaleDiagonalReturnType = + DiagonalWrapper; + + /** \returns the product of a scalar and the diagonal matrix \a other */ + EIGEN_DEVICE_FUNC + friend inline const ScaleDiagonalReturnType operator*(const Scalar& scalar, const DiagonalBase& other) { + return (scalar * other.diagonal()).asDiagonal(); + } + + template + using DiagonalSumReturnType = DiagonalWrapper; + + /** \returns the sum of \c *this and the diagonal matrix \a other */ + template + EIGEN_DEVICE_FUNC inline const DiagonalSumReturnType operator+( + const DiagonalBase& other) const { return (diagonal() + other.diagonal()).asDiagonal(); } - template - EIGEN_DEVICE_FUNC - #ifdef EIGEN_PARSED_BY_DOXYGEN - inline unspecified_expression_type - #else - inline const DiagonalWrapper - #endif - operator-(const DiagonalBase& other) const - { + template + using DiagonalDifferenceReturnType = DiagonalWrapper; + + /** \returns the difference of \c *this and the diagonal matrix \a other */ + template + EIGEN_DEVICE_FUNC inline const DiagonalDifferenceReturnType operator-( + const DiagonalBase& other) const { return (diagonal() - other.diagonal()).asDiagonal(); } }; -#endif - /** \class DiagonalMatrix - * \ingroup Core_Module - * - * \brief Represents a diagonal matrix with its storage - * - * \param _Scalar the type of coefficients - * \param SizeAtCompileTime the dimension of the matrix, or Dynamic - * \param MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults - * to SizeAtCompileTime. Most of the time, you do not need to specify it. - * - * \sa class DiagonalWrapper - */ + * \ingroup Core_Module + * + * \brief Represents a diagonal matrix with its storage + * + * \tparam Scalar_ the type of coefficients + * \tparam SizeAtCompileTime the dimension of the matrix, or Dynamic + * \tparam MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults + * to SizeAtCompileTime. Most of the time, you do not need to specify it. 
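// ---------------------------------------------------------------------------
// A short sketch (not part of the diff) of the DiagonalBase operators defined
// above: products, sums, differences, scalings and inverses of diagonal
// matrices stay diagonal and are computed coefficient-wise on the stored
// diagonal vectors. The operator* overload for two diagonal operands is new
// in this diff; the expected values below follow from its cwiseProduct-based
// definition.
#include <Eigen/Dense>

inline void diagonal_algebra() {
  Eigen::DiagonalMatrix<double, 3> d1(1.0, 2.0, 4.0);
  Eigen::DiagonalMatrix<double, 3> d2(2.0, 2.0, 2.0);
  Eigen::Vector3d prod  = (d1 * d2).diagonal();     // 2, 4, 8
  Eigen::Vector3d sum   = (d1 + d2).diagonal();     // 3, 4, 6
  Eigen::Vector3d inv   = d1.inverse().diagonal();  // 1, 0.5, 0.25
  Eigen::Vector3d twice = (2.0 * d1).diagonal();    // 2, 4, 8
  (void)prod; (void)sum; (void)inv; (void)twice;
}
// ---------------------------------------------------------------------------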
+ * + * \sa class DiagonalBase, class DiagonalWrapper + */ namespace internal { -template -struct traits > - : traits > +template +struct traits > + : traits > { - typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType; + typedef Matrix DiagonalVectorType; typedef DiagonalShape StorageKind; enum { - Flags = LvalueBit | NoPreferredStorageOrderBit + Flags = LvalueBit | NoPreferredStorageOrderBit | NestByRefBit }; }; } -template +template class DiagonalMatrix - : public DiagonalBase > + : public DiagonalBase > { public: #ifndef EIGEN_PARSED_BY_DOXYGEN typedef typename internal::traits::DiagonalVectorType DiagonalVectorType; typedef const DiagonalMatrix& Nested; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::StorageIndex StorageIndex; #endif @@ -178,10 +214,7 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} - #if EIGEN_HAS_CXX11 - /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11 - * - * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. + /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. * * \warning To construct a diagonal matrix of fixed size, the number of values passed to this * constructor must match the fixed dimension of \c *this. @@ -200,7 +233,10 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) : m_diagonal(list) {} - #endif // EIGEN_HAS_CXX11 + + /** \brief Constructs a DiagonalMatrix from an r-value diagonal vector type */ + EIGEN_DEVICE_FUNC + explicit inline DiagonalMatrix(DiagonalVectorType&& diag) : m_diagonal(std::move(diag)) {} /** Copy constructor. */ template @@ -239,6 +275,22 @@ class DiagonalMatrix } #endif + typedef DiagonalWrapper, DiagonalVectorType>> + InitializeReturnType; + + /** Initializes a diagonal matrix of size SizeAtCompileTime with coefficients set to zero */ + EIGEN_DEVICE_FUNC + static const InitializeReturnType Zero() { return DiagonalVectorType::Zero().asDiagonal(); } + /** Initializes a diagonal matrix of size dim with coefficients set to zero */ + EIGEN_DEVICE_FUNC + static const InitializeReturnType Zero(Index size) { return DiagonalVectorType::Zero(size).asDiagonal(); } + /** Initializes an identity matrix of size SizeAtCompileTime */ + EIGEN_DEVICE_FUNC + static const InitializeReturnType Identity() { return DiagonalVectorType::Ones().asDiagonal(); } + /** Initializes an identity matrix of size dim */ + EIGEN_DEVICE_FUNC + static const InitializeReturnType Identity(Index size) { return DiagonalVectorType::Ones(size).asDiagonal(); } + /** Resizes to given size. */ EIGEN_DEVICE_FUNC inline void resize(Index size) { m_diagonal.resize(size); } @@ -261,7 +313,7 @@ class DiagonalMatrix * * \brief Expression of a diagonal matrix * - * \param _DiagonalVectorType the type of the vector of diagonal coefficients + * \tparam DiagonalVectorType_ the type of the vector of diagonal coefficients * * This class is an expression of a diagonal matrix, but not storing its own vector of diagonal coefficients, * instead wrapping an existing vector expression.
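// ---------------------------------------------------------------------------
// A sketch (not part of the diff) of the DiagonalMatrix factories added above.
// Zero() and Identity() are assumed to behave as their one-line definitions
// suggest: they wrap a zero/ones nullary vector expression in a
// DiagonalWrapper. asDiagonal(), the subject of the DiagonalWrapper section
// that follows, wraps an existing vector without copying it.
#include <Eigen/Dense>

inline void diagonal_factories() {
  auto z = Eigen::DiagonalMatrix<double, 3>::Zero();      // diag(0, 0, 0)
  auto i = Eigen::DiagonalMatrix<double, 3>::Identity();  // diag(1, 1, 1)
  Eigen::Vector3d v(1.0, 2.0, 3.0);
  Eigen::Matrix3d dense = v.asDiagonal();  // materialize the wrapper as dense
  (void)z; (void)i; (void)dense;
}
// ---------------------------------------------------------------------------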
It is the return type of MatrixBase::asDiagonal() @@ -271,10 +323,10 @@ class DiagonalMatrix */ namespace internal { -template -struct traits > +template +struct traits > { - typedef _DiagonalVectorType DiagonalVectorType; + typedef DiagonalVectorType_ DiagonalVectorType; typedef typename DiagonalVectorType::Scalar Scalar; typedef typename DiagonalVectorType::StorageIndex StorageIndex; typedef DiagonalShape StorageKind; @@ -289,13 +341,13 @@ struct traits > }; } -template +template class DiagonalWrapper - : public DiagonalBase >, internal::no_assignment_operator + : public DiagonalBase >, internal::no_assignment_operator { public: #ifndef EIGEN_PARSED_BY_DOXYGEN - typedef _DiagonalVectorType DiagonalVectorType; + typedef DiagonalVectorType_ DiagonalVectorType; typedef DiagonalWrapper Nested; #endif @@ -386,6 +438,6 @@ struct Assignment } // namespace internal -} // end namespace Eigen +} // end namespace Eigen #endif // EIGEN_DIAGONALMATRIX_H diff --git a/libs/eigen/Eigen/src/Core/DiagonalProduct.h b/libs/eigen/Eigen/src/Core/DiagonalProduct.h index 7911d1c..3cd34ba 100644 --- a/libs/eigen/Eigen/src/Core/DiagonalProduct.h +++ b/libs/eigen/Eigen/src/Core/DiagonalProduct.h @@ -11,6 +11,8 @@ #ifndef EIGEN_DIAGONALPRODUCT_H #define EIGEN_DIAGONALPRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal. diff --git a/libs/eigen/Eigen/src/Core/Dot.h b/libs/eigen/Eigen/src/Core/Dot.h index 5c3441b..0c13192 100644 --- a/libs/eigen/Eigen/src/Core/Dot.h +++ b/libs/eigen/Eigen/src/Core/Dot.h @@ -10,6 +10,8 @@ #ifndef EIGEN_DOT_H #define EIGEN_DOT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -18,14 +20,9 @@ namespace internal { // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. Thus this is a trick to get better compile errors. 
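// ---------------------------------------------------------------------------
// A sketch (not part of the diff) of what the NeedToTranspose machinery below
// buys at the user level: dot() accepts any mix of row and column vectors of
// equal length, inserting the (conjugate) transpose itself, while genuinely
// mismatched operands hit a readable static assertion instead of a deep
// instantiation error.
#include <Eigen/Dense>

inline double dot_examples() {
  Eigen::Vector3d    col(1.0, 2.0, 3.0);
  Eigen::RowVector3d row(4.0, 5.0, 6.0);
  double a = col.dot(col);  // column . column, the straightforward case
  double b = col.dot(row);  // column . row, handled by the transpose branch
  return a + b;             // 14 + 32
}
// ---------------------------------------------------------------------------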
template + bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime && + ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) || + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))> struct dot_nocheck { typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; @@ -123,8 +120,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { - typedef typename internal::nested_eval::type _Nested; - _Nested n(derived()); + typedef typename internal::nested_eval::type Nested_; + Nested_ n(derived()); RealScalar z = n.squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU if(z>RealScalar(0)) @@ -166,8 +163,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { - typedef typename internal::nested_eval::type _Nested; - _Nested n(derived()); + typedef typename internal::nested_eval::type Nested_; + Nested_ n(derived()); RealScalar w = n.cwiseAbs().maxCoeff(); RealScalar z = (n/w).squaredNorm(); if(z>RealScalar(0)) diff --git a/libs/eigen/Eigen/src/Core/EigenBase.h b/libs/eigen/Eigen/src/Core/EigenBase.h index 6b3c7d3..105488d 100644 --- a/libs/eigen/Eigen/src/Core/EigenBase.h +++ b/libs/eigen/Eigen/src/Core/EigenBase.h @@ -11,6 +11,8 @@ #ifndef EIGEN_EIGENBASE_H #define EIGEN_EIGENBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class EigenBase diff --git a/libs/eigen/Eigen/src/Core/ForceAlignedAccess.h b/libs/eigen/Eigen/src/Core/ForceAlignedAccess.h index 817a43a..b00785e 100644 --- a/libs/eigen/Eigen/src/Core/ForceAlignedAccess.h +++ b/libs/eigen/Eigen/src/Core/ForceAlignedAccess.h @@ -10,6 +10,8 @@ #ifndef EIGEN_FORCEALIGNEDACCESS_H #define EIGEN_FORCEALIGNEDACCESS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class ForceAlignedAccess @@ -128,7 +130,7 @@ MatrixBase::forceAlignedAccess() */ template template -inline typename internal::add_const_on_value_type,Derived&>::type>::type +inline add_const_on_value_type_t,Derived&>> MatrixBase::forceAlignedAccessIf() const { return derived(); // FIXME This should not work but apparently is never used @@ -139,7 +141,7 @@ MatrixBase::forceAlignedAccessIf() const */ template template -inline typename internal::conditional,Derived&>::type +inline std::conditional_t,Derived&> MatrixBase::forceAlignedAccessIf() { return derived(); // FIXME This should not work but apparently is never used diff --git a/libs/eigen/Eigen/src/Core/Fuzzy.h b/libs/eigen/Eigen/src/Core/Fuzzy.h index 43aa49b..b16b2da 100644 --- a/libs/eigen/Eigen/src/Core/Fuzzy.h +++ b/libs/eigen/Eigen/src/Core/Fuzzy.h @@ -11,6 +11,8 @@ #ifndef EIGEN_FUZZY_H #define EIGEN_FUZZY_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal diff --git a/libs/eigen/Eigen/src/Core/GeneralProduct.h b/libs/eigen/Eigen/src/Core/GeneralProduct.h index 6906aa7..661a3c4 100644 --- a/libs/eigen/Eigen/src/Core/GeneralProduct.h +++ b/libs/eigen/Eigen/src/Core/GeneralProduct.h @@ -11,6 +11,8 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { enum { @@ -50,17 +52,17 @@ template struct product_size_category template struct product_type { - typedef typename remove_all::type _Lhs; - typedef typename remove_all::type _Rhs; + typedef remove_all_t Lhs_; + typedef remove_all_t Rhs_; enum { - MaxRows = 
traits<_Lhs>::MaxRowsAtCompileTime, - Rows = traits<_Lhs>::RowsAtCompileTime, - MaxCols = traits<_Rhs>::MaxColsAtCompileTime, - Cols = traits<_Rhs>::ColsAtCompileTime, - MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime, - traits<_Rhs>::MaxRowsAtCompileTime), - Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime, - traits<_Rhs>::RowsAtCompileTime) + MaxRows = traits::MaxRowsAtCompileTime, + Rows = traits::RowsAtCompileTime, + MaxCols = traits::MaxColsAtCompileTime, + Cols = traits::ColsAtCompileTime, + MaxDepth = min_size_prefer_fixed(traits::MaxColsAtCompileTime, + traits::MaxRowsAtCompileTime), + Depth = min_size_prefer_fixed(traits::ColsAtCompileTime, + traits::RowsAtCompileTime) }; // the splitting into different lines of code here, introducing the _select enums and the typedef below, @@ -180,12 +182,13 @@ struct gemv_static_vector_if PacketSize = internal::packet_traits::size }; #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0 - internal::plain_array m_data; + internal::plain_array m_data; EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; } #else // Some architectures cannot align on the stack, // => let's manually enforce alignment by allocating more data and return the address of the first aligned element. - internal::plain_array m_data; + internal::plain_array m_data; EIGEN_STRONG_INLINE Scalar* data() { return ForceAlignment ? reinterpret_cast((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES) @@ -216,14 +219,13 @@ template<> struct gemv_dense_selector typedef typename Lhs::Scalar LhsScalar; typedef typename Rhs::Scalar RhsScalar; typedef typename Dest::Scalar ResScalar; - typedef typename Dest::RealScalar RealScalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef Map, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits::size)> MappedDest; + typedef Map, plain_enum_min(AlignedMax, internal::packet_traits::size)> MappedDest; ActualLhsType actualLhs = LhsBlasTraits::extract(lhs); ActualRhsType actualRhs = RhsBlasTraits::extract(rhs); @@ -231,7 +233,7 @@ template<> struct gemv_dense_selector ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); // make sure Dest is a compile-time vector type (bug 1166) - typedef typename conditional::type ActualDest; + typedef std::conditional_t ActualDest; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -261,7 +263,7 @@ template<> struct gemv_dense_selector { gemv_static_vector_if static_dest; - const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); + const bool alphaIsCompatible = (!ComplexByReal) || (numext::is_exactly_zero(numext::imag(actualAlpha))); const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -314,10 +316,10 @@ template<> struct gemv_dense_selector typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; + typedef internal::remove_all_t ActualRhsTypeCleaned; - typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); - typename add_const::type actualRhs = 
RhsBlasTraits::extract(rhs); + std::add_const_t actualLhs = LhsBlasTraits::extract(lhs); + std::add_const_t actualRhs = RhsBlasTraits::extract(rhs); ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); diff --git a/libs/eigen/Eigen/src/Core/GenericPacketMath.h b/libs/eigen/Eigen/src/Core/GenericPacketMath.h index cf677a1..af773dd 100644 --- a/libs/eigen/Eigen/src/Core/GenericPacketMath.h +++ b/libs/eigen/Eigen/src/Core/GenericPacketMath.h @@ -11,6 +11,8 @@ #ifndef EIGEN_GENERIC_PACKET_MATH_H #define EIGEN_GENERIC_PACKET_MATH_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -57,12 +59,14 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasSign = 1, HasBlend = 0, // This flag is used to indicate whether packet comparison is supported. // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. HasCmp = 0, HasDiv = 0, + HasReciprocal = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -98,8 +102,7 @@ struct default_packet_traits HasRound = 0, HasRint = 0, HasFloor = 0, - HasCeil = 0, - HasSign = 0 + HasCeil = 0 }; }; @@ -160,7 +163,7 @@ struct eigen_packet_wrapper { EIGEN_ALWAYS_INLINE operator T&() { return m_val; } EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() = default; EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { m_val = v; @@ -176,7 +179,7 @@ struct eigen_packet_wrapper */ template struct is_scalar { - typedef typename unpacket_traits::type Scalar; + using Scalar = typename unpacket_traits::type; enum { value = internal::is_same::value }; @@ -217,6 +220,15 @@ padd(const Packet& a, const Packet& b) { return a+b; } template<> EIGEN_DEVICE_FUNC inline bool padd(const bool& a, const bool& b) { return a || b; } +/** \internal \returns a packet version of \a *from, (un-aligned masked add) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. + */ +template EIGEN_DEVICE_FUNC inline +std::enable_if_t::masked_fpops_available, Packet> +padd(const Packet& a, const Packet& b, typename unpacket_traits::mask_t umask); + + /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet psub(const Packet& a, const Packet& b) { return a-b; } @@ -259,7 +271,7 @@ struct ptrue_impl { // have another option, since the scalar type requires initialization. template struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + std::enable_if_t::value && NumTraits::RequireInitialization> > { static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ return T(1); } @@ -285,7 +297,7 @@ struct pzero_impl { // for zero may not consist of all-zero bits. 
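The remark above is easiest to see with a scalar type whose zero is not the all-zero bit pattern; for such types pzero_impl must construct T(0) by value rather than memset the bits. A minimal sketch (BigNum is hypothetical, not an Eigen type):

    // Hypothetical user-defined scalar: a valid value, including zero, always
    // holds a non-null buffer pointer, so an all-zero-bits object would be broken.
    struct BigNum {
      explicit BigNum(int v) : digits(new int[1]{v}) {}
      int* digits;  // never null for a valid BigNum, even when the value is 0
    };
    // For scalars like this, the value-based fallback below returns BigNum(0)
    // instead of writing an all-zero byte pattern.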
template struct pzero_impl::value>::type> { + std::enable_if_t::value>> { static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { return T(0); } @@ -356,16 +368,16 @@ struct bytewise_bitwise_helper { EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return binary(a, b, bit_and()); } - EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return binary(a, b, bit_or()); } EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return binary(a, b, bit_xor()); } - EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return unary(a,bit_not()); } - + private: template EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { @@ -398,8 +410,8 @@ struct bitwise_helper : public bytewise_bitwise_helper {}; // For integers or non-trivial scalars, use binary operators. template struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + typename std::enable_if_t< + is_scalar::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)> > : public operator_bitwise_helper {}; /** \internal \returns the bitwise and of \a a and \a b */ @@ -441,7 +453,7 @@ struct pselect_impl { // For scalars, use ternary select. template struct pselect_impl::value>::type > { + std::enable_if_t::value> > { static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { return numext::equal_strict(mask, Packet(0)) ? b : a; } @@ -551,13 +563,13 @@ template EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) { using numext::arg; return arg(a); } -/** \internal \returns \a a logically shifted by N bits to the right */ +/** \internal \returns \a a arithmetically shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int parithmetic_shift_right(const int& a) { return a >> N; } template EIGEN_DEVICE_FUNC inline long int parithmetic_shift_right(const long int& a) { return a >> N; } -/** \internal \returns \a a arithmetically shifted by N bits to the right */ +/** \internal \returns \a a logically shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } template EIGEN_DEVICE_FUNC inline long int @@ -594,20 +606,52 @@ pldexp(const Packet &a, const Packet &exponent) { template EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } -/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ +/** \internal \returns a packet version of \a *from, from must be properly aligned */ template EIGEN_DEVICE_FUNC inline Packet pload(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned + * offset indicates the starting element in which to load and + * offset + n <= unpacket_traits::size + * All elements before offset and after the last element loaded will initialized with zero */ +template EIGEN_DEVICE_FUNC inline Packet +pload_partial(const typename unpacket_traits::type* from, const Index n, const Index offset = 0) +{ + const Index packet_size = unpacket_traits::size; + eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet"); + typedef typename unpacket_traits::type Scalar; + EIGEN_ALIGN_MAX Scalar 
elements[packet_size] = { Scalar(0) };
+  for (Index i = offset; i < numext::mini(n+offset,packet_size); i++) {
+    elements[i] = from[i-offset];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

+/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
+  * All elements after the last element loaded will be initialized with zero */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n)
+{
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n <= packet_size && "number of elements will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) };
+  for (Index i = 0; i < numext::mini(n,packet_size); i++) {
+    elements[i] = from[i];
+  }
+  return pload<Packet>(elements);
+}
+
 /** \internal \returns a packet version of \a *from, (un-aligned masked load)
   * There is no generic implementation. We only have implementations for specialized
   * cases. Generic case should not be called.
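As a usage sketch of the partial load/store pair (assuming an SSE or NEON build where the internal 4-lane float type Packet4f exists; these are internal APIs and may change):

    #include <Eigen/Core>

    int main() {
      using namespace Eigen::internal;
      float src[3] = {1.f, 2.f, 3.f};
      float dst[3] = {0.f, 0.f, 0.f};
      // Load 3 of the 4 lanes; the trailing lane is zero-filled, as documented.
      Packet4f p = ploadu_partial<Packet4f>(src, /*n=*/3);
      // Store exactly 3 lanes back, so nothing is written past dst[2].
      pstoreu_partial(dst, p, /*n=*/3);
    }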
*/ template EIGEN_DEVICE_FUNC inline -typename enable_if::masked_store_available, void>::type +std::enable_if_t::masked_store_available, void> pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t umask); - template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) - { return ploadu(from); } +template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) +{ return ploadu(from); } - template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) - { pstore(to, from); } +template EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n) +{ + const Index packet_size = unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar elements[packet_size] = { Scalar(0) }; + for (Index i = 0; i < numext::mini(n,packet_size); i++) { + elements[i] = from[i*stride]; + } + return pload(elements); +} + +template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) +{ pstore(to, from); } + +template EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n) +{ + const Index packet_size = unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar elements[packet_size]; + pstore(elements, from); + for (Index i = 0; i < numext::mini(n,packet_size); i++) { + to[i*stride] = elements[i]; + } +} /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) @@ -807,20 +897,13 @@ Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet& a) { typedef typename internal::unpacket_traits::type Scalar; - return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); + return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); } /** \internal \returns the square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt(const Packet& a) { return numext::sqrt(a); } -/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet prsqrt(const Packet& a) { - typedef typename internal::unpacket_traits::type Scalar; - return pdiv(pset1(Scalar(1)), psqrt(a)); -} - /** \internal \returns the rounded value of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pround(const Packet& a) { using numext::round; return round(a); } @@ -838,6 +921,24 @@ Packet print(const Packet& a) { using numext::rint; return rint(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +template +struct psign_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { + return numext::sign(a); + } +}; + +/** \internal \returns the sign of \a a (coeff-wise) */ +template EIGEN_DEVICE_FUNC inline Packet +psign(const Packet& a) { + return psign_impl::run(a); +} + +template<> EIGEN_DEVICE_FUNC inline bool +psign(const bool& a) { + return a; +} + /** \internal \returns the first element of a packet */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type @@ -849,7 +950,7 @@ pfirst(const Packet& a) * For packet-size smaller or equal to 4, this boils down to a noop. 
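The strided variants follow the same pattern; a plain-array stand-in for a 4-lane packet shows the access pattern pgather_partial implements (a sketch only, not Eigen code):

    #include <array>

    // Read n elements spaced `stride` apart, zero-fill the remaining lanes,
    // mirroring the generic pgather_partial above.
    template <int N>
    std::array<float, N> pgather_partial_sketch(const float* from, int stride, int n) {
      std::array<float, N> out{};          // zero-filled, like the packet version
      for (int i = 0; i < n && i < N; ++i)
        out[i] = from[i * stride];         // strided reads
      return out;
    }

    // e.g. pgather_partial_sketch<4>(rowPtr, /*stride=*/rows, /*n=*/3) collects
    // three elements of a row out of a column-major matrix.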
*/ template -EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +EIGEN_DEVICE_FUNC inline std::conditional_t<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet> predux_half_dowto4(const Packet& a) { return a; } @@ -881,7 +982,7 @@ predux(const Packet& a) template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul))); } @@ -889,14 +990,14 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( const Packet &a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); } @@ -904,14 +1005,14 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( const Packet &a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); } @@ -943,6 +1044,35 @@ template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ +// FMA instructions. +/** \internal \returns a * b + c (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, + const Packet& c) { + return padd(pmul(a, b), c); +} + +/** \internal \returns a * b - c (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, + const Packet& c) { + return psub(pmul(a, b), c); +} + +/** \internal \returns -(a * b) + c (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, + const Packet& c) { + return padd(pnegate(pmul(a, b)), c); +} + +/** \internal \returns -(a * b) - c (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, + const Packet& c) { + return psub(pnegate(pmul(a, b)), c); +} + /** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) template @@ -951,13 +1081,6 @@ inline void pstore1(typename unpacket_traits::type* to, const typename u pstore(to, pset1(a)); } -/** \internal \returns a * b + c (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pmadd(const Packet& a, - const Packet& b, - const Packet& c) -{ return padd(pmul(a, b),c); } - /** \internal \returns a packet version of \a *from. 
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */ template @@ -969,6 +1092,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_trai return ploadu(from); } +/** \internal \returns n elements of a packet version of \a *from. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits::type* from, const Index n, const Index offset = 0) +{ + if(Alignment >= unpacket_traits::alignment) + return pload_partial(from, n, offset); + else + return ploadu_partial(from, n); +} + /** \internal copy the packet \a from to \a *to. * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ template @@ -980,6 +1114,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro pstoreu(to, from); } +/** \internal copy n elements of the packet \a from to \a *to. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0) +{ + if(Alignment >= unpacket_traits::alignment) + pstore_partial(to, from, n, offset); + else + pstoreu_partial(to, from, n); +} + /** \internal \returns a packet version of \a *from. * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the * hardware if available to speedup the loading of data that won't be modified @@ -1033,6 +1178,47 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } +/** \internal \returns 1 / a (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet preciprocal(const Packet& a) { + using Scalar = typename unpacket_traits::type; + return pdiv(pset1(Scalar(1)), a); +} + +/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet prsqrt(const Packet& a) { + return preciprocal(psqrt(a)); +} + +template ::value, + bool IsInteger = NumTraits::type>::IsInteger> + struct psignbit_impl; +template +struct psignbit_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return numext::signbit(a); } +}; +template +struct psignbit_impl { + // generic implementation if not specialized in PacketMath.h + // slower than arithmetic shift + typedef typename unpacket_traits::type Scalar; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Packet run(const Packet& a) { + const Packet cst_pos_one = pset1(Scalar(1)); + const Packet cst_neg_one = pset1(Scalar(-1)); + return pcmp_eq(por(pand(a, cst_neg_one), cst_pos_one), cst_neg_one); + } +}; +template +struct psignbit_impl { + // generic implementation for integer packets + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return pcmp_lt(a, pzero(a)); } +}; +/** \internal \returns the sign bit of \a a as a bitmask*/ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr Packet +psignbit(const Packet& a) { return psignbit_impl::run(a); } + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/GlobalFunctions.h b/libs/eigen/Eigen/src/Core/GlobalFunctions.h index 629af94..18792cb 100644 --- a/libs/eigen/Eigen/src/Core/GlobalFunctions.h +++ b/libs/eigen/Eigen/src/Core/GlobalFunctions.h @@ -51,6 +51,8 @@ } \ }; +#include "./InternalHeaderCheck.h" + namespace Eigen { 
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real) @@ -66,11 +68,9 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) -#if EIGEN_HAS_CXX11_MATH EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) -#endif EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) @@ -99,31 +99,31 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) + template + using GlobalUnaryPowReturnType = std::enable_if_t< + !internal::is_arithmetic::Real>::value && + internal::is_arithmetic::Real>::value, + CwiseUnaryOp, const Derived> >; + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. - * - * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). - * - * \sa ArrayBase::pow() - * - * \relates ArrayBase - */ + * + * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given + * expression (\c Derived::Scalar). 
+ * + * \sa ArrayBase::pow() + * + * \relates ArrayBase + */ #ifdef EIGEN_PARSED_BY_DOXYGEN - template - inline const CwiseBinaryOp,Derived,Constant > - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); + template + EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType pow( + const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - EIGEN_DEVICE_FUNC inline - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) - { - typedef typename internal::promote_scalar_arg::type PromotedExponent; - return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), - typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); + template + EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType pow( + const Eigen::ArrayBase& x, const ScalarExponent& exponent) { + return GlobalUnaryPowReturnType( + x.derived(), internal::scalar_unary_pow_op(exponent)); } #endif @@ -168,10 +168,9 @@ namespace Eigen #else template EIGEN_DEVICE_FUNC inline - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,pow)) + EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow) pow(const Scalar& x, const Eigen::ArrayBase& exponents) { typedef typename internal::promote_scalar_arg::type PromotedScalar; @@ -180,6 +179,25 @@ namespace Eigen } #endif + /** \returns an expression of the coefficient-wise atan2(\a x, \a y). \a x and \a y must be of the same type. + * + * This function computes the coefficient-wise atan2(). 
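Together with the coefficient-wise atan2 declared in this same hunk, the scalar-exponent pow path can be exercised as follows (a sketch against this revision of the library):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXf x(3), y(3);
      x << 1.f, 2.f, 3.f;
      y << 3.f, 2.f, 1.f;
      std::cout << Eigen::pow(x, 2.f) << "\n";   // coefficient-wise x^2, via the new unary-op path
      std::cout << Eigen::atan2(x, y) << "\n";   // coefficient-wise atan2; both arrays must share a scalar type
    }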
+ * + * \sa ArrayBase::atan2() + * + * \relates ArrayBase + */ + template + inline const std::enable_if_t< + std::is_same::value, + Eigen::CwiseBinaryOp, const LhsDerived, const RhsDerived> + > + atan2(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { + return Eigen::CwiseBinaryOp, const LhsDerived, const RhsDerived>( + x.derived(), + exponents.derived() + ); + } namespace internal { diff --git a/libs/eigen/Eigen/src/Core/IO.h b/libs/eigen/Eigen/src/Core/IO.h index e81c315..897d7b0 100644 --- a/libs/eigen/Eigen/src/Core/IO.h +++ b/libs/eigen/Eigen/src/Core/IO.h @@ -11,6 +11,8 @@ #ifndef EIGEN_IO_H #define EIGEN_IO_H +#include "./InternalHeaderCheck.h" + namespace Eigen { enum { DontAlignCols = 1 }; @@ -131,7 +133,6 @@ template std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt) { using internal::is_same; - using internal::conditional; if(_m.size() == 0) { @@ -141,22 +142,21 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& typename Derived::Nested m = _m; typedef typename Derived::Scalar Scalar; - typedef typename - conditional< + typedef std::conditional_t< is_same::value || is_same::value || is_same::value || is_same::value, int, - typename conditional< + std::conditional_t< is_same >::value || is_same >::value || is_same >::value || is_same >::value, std::complex, const Scalar& - >::type - >::type PrintType; + > + > PrintType; Index width = 0; diff --git a/libs/eigen/Eigen/src/Core/IndexedView.h b/libs/eigen/Eigen/src/Core/IndexedView.h index 0847625..f967301 100644 --- a/libs/eigen/Eigen/src/Core/IndexedView.h +++ b/libs/eigen/Eigen/src/Core/IndexedView.h @@ -10,6 +10,8 @@ #ifndef EIGEN_INDEXED_VIEW_H #define EIGEN_INDEXED_VIEW_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -21,8 +23,8 @@ struct traits > enum { RowsAtCompileTime = int(array_size::value), ColsAtCompileTime = int(array_size::value), - MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic, - MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic, + MaxRowsAtCompileTime = RowsAtCompileTime, + MaxColsAtCompileTime = ColsAtCompileTime, XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 @@ -40,10 +42,10 @@ struct traits > InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime, IsBlockAlike = InnerIncr==1 && OuterIncr==1, - IsInnerPannel = HasSameStorageOrderAsXprType && is_same,typename conditional::type>::value, + IsInnerPannel = HasSameStorageOrderAsXprType && is_same,std::conditional_t>::value, - InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr, - OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr, + InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic || InnerIncr==UndefinedIncr ? Dynamic : XprInnerStride * InnerIncr, + OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic || OuterIncr==UndefinedIncr ? 
Dynamic : XprOuterstride * OuterIncr, ReturnAsScalar = is_same::value && is_same::value, ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike, @@ -96,7 +98,7 @@ class IndexedViewImpl; * - decltype(ArrayXi::LinSpaced(...)) * - Any view/expressions of the previous types * - Eigen::ArithmeticSequence - * - Eigen::internal::AllRange (helper for Eigen::all) + * - Eigen::internal::AllRange (helper for Eigen::placeholders::all) * - Eigen::internal::SingleRange (helper for single index) * - etc. * @@ -114,7 +116,7 @@ public: EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView) typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; template IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) @@ -122,17 +124,17 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ - typename internal::remove_reference::type& + std::remove_reference_t& nestedExpression() { return m_xpr; } /** \returns a const reference to the object storing/generating the row indices */ @@ -189,12 +191,16 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -204,6 +210,8 @@ struct unary_evaluator, IndexBased> EIGEN_STATIC_ASSERT_LVALUE(XprType) Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -212,6 +220,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -220,6 +230,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? 
index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } diff --git a/libs/eigen/Eigen/src/Core/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Core/InternalHeaderCheck.h new file mode 100644 index 0000000..1cea572 --- /dev/null +++ b/libs/eigen/Eigen/src/Core/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CORE_MODULE_H +#error "Please include Eigen/Core instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/Core/Inverse.h b/libs/eigen/Eigen/src/Core/Inverse.h index c514438..9c70733 100644 --- a/libs/eigen/Eigen/src/Core/Inverse.h +++ b/libs/eigen/Eigen/src/Core/Inverse.h @@ -10,6 +10,8 @@ #ifndef EIGEN_INVERSE_H #define EIGEN_INVERSE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class InverseImpl; @@ -46,9 +48,9 @@ public: typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename internal::ref_selector::type XprTypeNested; - typedef typename internal::remove_all::type XprTypeNestedCleaned; + typedef internal::remove_all_t XprTypeNestedCleaned; typedef typename internal::ref_selector::type Nested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr) : m_xpr(xpr) @@ -102,7 +104,7 @@ struct unary_evaluator > unary_evaluator(const InverseType& inv_xpr) : m_result(inv_xpr.rows(), inv_xpr.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); internal::call_assignment_no_alias(m_result, inv_xpr); } diff --git a/libs/eigen/Eigen/src/Core/Map.h b/libs/eigen/Eigen/src/Core/Map.h index 218cc15..56d1ff8 100644 --- a/libs/eigen/Eigen/src/Core/Map.h +++ b/libs/eigen/Eigen/src/Core/Map.h @@ -11,6 +11,8 @@ #ifndef EIGEN_MAP_H #define EIGEN_MAP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -129,7 +131,6 @@ template class Ma explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType()) : Base(cast_to_pointer_type(dataPtr)), m_stride(stride) { - PlainObjectType::Base::_check_template_params(); } /** Constructor in the dynamic-size vector case. @@ -142,7 +143,6 @@ template class Ma inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType()) : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride) { - PlainObjectType::Base::_check_template_params(); } /** Constructor in the dynamic-size matrix case. 
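For reference, the Map constructors kept here are the usual entry points for viewing raw memory without copying; a small self-contained example:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      float buf[6] = {1, 2, 3, 4, 5, 6};
      // Fixed-size, row-major 2x3 view of buf; writes go through to the buffer.
      Eigen::Map<Eigen::Matrix<float, 2, 3, Eigen::RowMajor>> m(buf);
      m(0, 1) = 20.f;                        // modifies buf[1]
      std::cout << m << "\n";
      // Dynamic-size variant using the (ptr, rows, cols) constructor above.
      Eigen::Map<Eigen::MatrixXf> d(buf, 2, 3);
      std::cout << d << "\n";
    }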
@@ -156,7 +156,6 @@ template class Ma inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType()) : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride) { - PlainObjectType::Base::_check_template_params(); } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) diff --git a/libs/eigen/Eigen/src/Core/MapBase.h b/libs/eigen/Eigen/src/Core/MapBase.h index d856447..bf8c163 100644 --- a/libs/eigen/Eigen/src/Core/MapBase.h +++ b/libs/eigen/Eigen/src/Core/MapBase.h @@ -15,6 +15,8 @@ EIGEN_STATIC_ASSERT((int(internal::evaluator::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT) +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup Core_Module @@ -51,11 +53,11 @@ template class MapBase typedef typename internal::traits::Scalar Scalar; typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; - typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType; + typedef std::conditional_t< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *> + PointerType; using Base::derived; // using Base::RowsAtCompileTime; @@ -189,7 +191,7 @@ template class MapBase template EIGEN_DEVICE_FUNC - void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const + void checkSanity(std::enable_if_t<(internal::traits::Alignment>0),void*> = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value: @@ -202,7 +204,7 @@ template class MapBase template EIGEN_DEVICE_FUNC - void checkSanity(typename internal::enable_if::Alignment==0,void*>::type = 0) const + void checkSanity(std::enable_if_t::Alignment==0,void*> = 0) const {} PointerType m_data; @@ -245,11 +247,11 @@ template class MapBase using Base::rowStride; using Base::colStride; - typedef typename internal::conditional< + typedef std::conditional_t< internal::is_lvalue::value, Scalar, const Scalar - >::type ScalarWithConstIfNotLvalue; + > ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC inline const Scalar* data() const { return this->m_data; } diff --git a/libs/eigen/Eigen/src/Core/MathFunctions.h b/libs/eigen/Eigen/src/Core/MathFunctions.h index 61b78f4..b194353 100644 --- a/libs/eigen/Eigen/src/Core/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/MathFunctions.h @@ -17,16 +17,9 @@ #define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L #define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L -namespace Eigen { +#include "./InternalHeaderCheck.h" -// On WINCE, std::abs is defined for int only, so let's defined our own overloads: -// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. 
-#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 -long abs(long x) { return (labs(x)); } -double abs(double x) { return (fabs(x)); } -float abs(float x) { return (fabsf(x)); } -long double abs(long double x) { return (fabsl(x)); } -#endif +namespace Eigen { namespace internal { @@ -236,6 +229,63 @@ struct imag_ref_retval typedef typename NumTraits::Real & type; }; + +/**************************************************************************** +* Implementation of sign * +****************************************************************************/ +template::IsComplex!=0), + bool IsInteger = (NumTraits::IsInteger!=0)> +struct sign_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& a) + { + return Scalar( (a>Scalar(0)) - (a +struct sign_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& a) + { + return (std::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a +struct sign_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& a) + { + using real_type = typename NumTraits::Real; + real_type aa = std::abs(a); + if (aa==real_type(0)) + return Scalar(0); + aa = real_type(1)/aa; + return Scalar(a.real()*aa, a.imag()*aa ); + } +}; + +// The sign function for bool is the identity. +template<> +struct sign_impl +{ + EIGEN_DEVICE_FUNC + static inline bool run(const bool& a) + { + return a; + } +}; + +template +struct sign_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of conj * ****************************************************************************/ @@ -441,9 +491,9 @@ struct cast_impl // generating warnings on clang. Here we explicitly cast the real component. template struct cast_impl::IsComplex && NumTraits::IsComplex - >::type> + >> { EIGEN_DEVICE_FUNC static inline NewType run(const OldType& x) @@ -469,57 +519,16 @@ inline NewType cast(const OldType& x) template struct round_impl { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) -#if EIGEN_HAS_CXX11_MATH EIGEN_USING_STD(round); -#endif return Scalar(round(x)); } }; -#if !EIGEN_HAS_CXX11_MATH -#if EIGEN_HAS_C99_MATH -// Use ::roundf for float. -template<> -struct round_impl { - EIGEN_DEVICE_FUNC - static inline float run(const float& x) - { - return ::roundf(x); - } -}; -#else -template -struct round_using_floor_ceil_impl -{ - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - // Without C99 round/roundf, resort to floor/ceil. - EIGEN_USING_STD(floor); - EIGEN_USING_STD(ceil); - // If not enough precision to resolve a decimal at all, return the input. - // Otherwise, adding 0.5 can trigger an increment by 1. - const Scalar limit = Scalar(1ull << (NumTraits::digits() - 1)); - if (x >= limit || x <= -limit) { - return x; - } - return (x > Scalar(0)) ? 
Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5))); - } -}; - -template<> -struct round_impl : round_using_floor_ceil_impl {}; - -template<> -struct round_impl : round_using_floor_ceil_impl {}; -#endif // EIGEN_HAS_C99_MATH -#endif // !EIGEN_HAS_CXX11_MATH - template struct round_retval { @@ -532,36 +541,16 @@ struct round_retval template struct rint_impl { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) -#if EIGEN_HAS_CXX11_MATH - EIGEN_USING_STD(rint); -#endif + EIGEN_USING_STD(rint); return rint(x); } }; -#if !EIGEN_HAS_CXX11_MATH -template<> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline double run(const double& x) - { - return ::rint(x); - } -}; -template<> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline float run(const float& x) - { - return ::rintf(x); - } -}; -#endif - template struct rint_retval { @@ -574,7 +563,7 @@ struct rint_retval // Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. // This seems to be fixed in VS 2019. -#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) +#if (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) // std::arg is only defined for types of std::complex, or integer types or float/double/long double template::IsComplex || is_integral::value @@ -675,11 +664,7 @@ struct expm1_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if EIGEN_HAS_CXX11_MATH using std::expm1; - #else - using std_fallback::expm1; - #endif return expm1(x); } }; @@ -736,14 +721,11 @@ namespace std_fallback { template struct log1p_impl { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if EIGEN_HAS_CXX11_MATH using std::log1p; - #else - using std_fallback::log1p; - #endif return log1p(x); } }; @@ -751,9 +733,10 @@ struct log1p_impl { // Specialization for complex types that are not supported by std::log1p. template struct log1p_impl > { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + EIGEN_DEVICE_FUNC static inline std::complex run( const std::complex& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) return std_fallback::log1p(x); } }; @@ -893,7 +876,7 @@ struct random_default_impl // ScalarX is the widest of ScalarU and unsigned int. // We'll deal only with ScalarX and unsigned int below thus avoiding signed // types and arithmetic and signed overflows (which are undefined behavior). - typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX; + typedef std::conditional_t<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned> ScalarX; // The following difference doesn't overflow, provided our integer types are two's // complement and have the same number of padding bits in signed and unsigned variants. // This is the case in most modern implementations of C++. @@ -918,8 +901,8 @@ struct random_default_impl #else enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value, scalar_bits = sizeof(Scalar) * CHAR_BIT, - shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)), - offset = NumTraits::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0 + shift = plain_enum_max(0, int(rand_bits) - int(scalar_bits)), + offset = NumTraits::IsSigned ? 
(1 << (plain_enum_min(rand_bits, scalar_bits)-1)) : 0 }; return Scalar((std::rand() >> shift) - offset); #endif @@ -956,7 +939,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() // Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. -#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) +#if (!(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC) || (EIGEN_COMP_CLANG) #define EIGEN_USE_STD_FPCLASSIFY 1 #else #define EIGEN_USE_STD_FPCLASSIFY 0 @@ -964,22 +947,22 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() template EIGEN_DEVICE_FUNC -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> isnan_impl(const T&) { return false; } template EIGEN_DEVICE_FUNC -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> isinf_impl(const T&) { return false; } template EIGEN_DEVICE_FUNC -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> isfinite_impl(const T&) { return true; } template EIGEN_DEVICE_FUNC -typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +std::enable_if_t<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool> isfinite_impl(const T& x) { #if defined(EIGEN_GPU_COMPILE_PHASE) @@ -994,7 +977,7 @@ isfinite_impl(const T& x) template EIGEN_DEVICE_FUNC -typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +std::enable_if_t<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool> isinf_impl(const T& x) { #if defined(EIGEN_GPU_COMPILE_PHASE) @@ -1009,7 +992,7 @@ isinf_impl(const T& x) template EIGEN_DEVICE_FUNC -typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +std::enable_if_t<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool> isnan_impl(const T& x) { #if defined(EIGEN_GPU_COMPILE_PHASE) @@ -1042,7 +1025,7 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms #elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC) -#if EIGEN_GNUC_AT_LEAST(5,0) +#if EIGEN_COMP_GNUC #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only"))) #else // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issue (duplicate symbol), @@ -1234,7 +1217,7 @@ inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) template EIGEN_DEVICE_FUNC -inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x) +inline internal::add_const_on_value_type_t< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) > real_ref(const Scalar& x) { return internal::real_ref_impl::run(x); } @@ -1262,7 +1245,7 @@ inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x) template EIGEN_DEVICE_FUNC -inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x) +inline internal::add_const_on_value_type_t< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) > imag_ref(const Scalar& x) { return internal::imag_ref_impl::run(x); } @@ -1281,6 +1264,13 @@ inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x) return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x); } +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(sign, Scalar) sign(const Scalar& x) +{ 
+ return EIGEN_MATHFUNC_IMPL(sign, Scalar)::run(x); +} + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) @@ -1505,7 +1495,7 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +std::enable_if_t::IsSigned || NumTraits::IsComplex,typename NumTraits::Real> abs(const T &x) { EIGEN_USING_STD(abs); return abs(x); @@ -1513,7 +1503,7 @@ abs(const T &x) { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename NumTraits::Real>::type +std::enable_if_t::IsSigned || NumTraits::IsComplex),typename NumTraits::Real> abs(const T &x) { return x; } @@ -1541,6 +1531,37 @@ double abs(const std::complex& x) { } #endif +template ::IsInteger, bool IsSigned = NumTraits::IsSigned> +struct signbit_impl; +template +struct signbit_impl { + static constexpr size_t Size = sizeof(Scalar); + static constexpr size_t Shift = (CHAR_BIT * Size) - 1; + using intSize_t = typename get_integer_by_size::signed_type; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Scalar run(const Scalar& x) { + intSize_t a = bit_cast(x); + a = a >> Shift; + Scalar result = bit_cast(a); + return result; + } +}; +template +struct signbit_impl { + static constexpr size_t Size = sizeof(Scalar); + static constexpr size_t Shift = (CHAR_BIT * Size) - 1; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar& x) { return x >> Shift; } +}; +template +struct signbit_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar& ) { + return Scalar(0); + } +}; +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar signbit(const Scalar& x) { + return signbit_impl::run(x); +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T &x) { @@ -1659,14 +1680,12 @@ T acos(const T &x) { return acos(x); } -#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T &x) { EIGEN_USING_STD(acosh); return static_cast(acosh(x)); } -#endif #if defined(SYCL_DEVICE_ONLY) SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) @@ -1688,14 +1707,12 @@ T asin(const T &x) { return asin(x); } -#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T &x) { EIGEN_USING_STD(asinh); return static_cast(asinh(x)); } -#endif #if defined(SYCL_DEVICE_ONLY) SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) @@ -1717,14 +1734,12 @@ T atan(const T &x) { return static_cast(atan(x)); } -#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T &x) { EIGEN_USING_STD(atanh); return static_cast(atanh(x)); } -#endif #if defined(SYCL_DEVICE_ONLY) SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) @@ -2006,9 +2021,10 @@ namespace internal { // Specialization for complex types that are not supported by std::expm1. 
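A quick check of the numext::sign semantics introduced above; the asserts follow the three implementation branches shown in the hunk (real, integer, complex), plus the bool identity:

    #include <Eigen/Core>
    #include <cassert>
    #include <cmath>
    #include <complex>

    int main() {
      using Eigen::numext::sign;
      assert(sign(-3.5) == -1.0);   // real branch: (a>0) - (a<0), NaN passes through
      assert(sign(0.0) == 0.0);
      assert(sign(42) == 1);        // integer branch
      assert(sign(true) == true);   // bool specialization is the identity
      // complex branch returns a / |a|, i.e. a unit-modulus value
      std::complex<double> z = sign(std::complex<double>(3.0, 4.0));
      assert(std::abs(z.real() - 0.6) < 1e-12 && std::abs(z.imag() - 0.8) < 1e-12);
    }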
 template<typename RealScalar>
 struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+
   EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
       const std::complex<RealScalar>& x) {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
     RealScalar xr = x.real();
     RealScalar xi = x.imag();
     // expm1(z) = exp(z) - 1
diff --git a/libs/eigen/Eigen/src/Core/MathFunctionsImpl.h b/libs/eigen/Eigen/src/Core/MathFunctionsImpl.h
index 4eaaaa7..642e5d6 100644
--- a/libs/eigen/Eigen/src/Core/MathFunctionsImpl.h
+++ b/libs/eigen/Eigen/src/Core/MathFunctionsImpl.h
@@ -11,17 +11,152 @@
 #ifndef EIGEN_MATHFUNCTIONSIMPL_H
 #define EIGEN_MATHFUNCTIONSIMPL_H
 
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
+/** \internal Fast reciprocal using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess provided in approx_a_recip must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
+      result.
+   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
+   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
+
+ If the preconditions are satisfied, which they are for the _*_rcp_ps
+ instructions on x86, the result has a maximum relative error of 2 ulps,
+ and correctly handles reciprocals of zero, infinity, and NaN.
+*/
+template <typename Packet, int Steps>
+struct generic_reciprocal_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+  run(const Packet& a, const Packet& approx_a_recip) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet two = pset1<Packet>(Scalar(2));
+    // Refine the approximation using one Newton-Raphson step:
+    //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
+    const Packet x =
+        generic_reciprocal_newton_step<Packet, Steps - 1>::run(a, approx_a_recip);
+    const Packet tmp = pnmadd(a, x, two);
+    // If tmp is NaN, it means that a is either +/-0 or +/-Inf.
+    // In this case return the approximation directly.
+    const Packet is_not_nan = pcmp_eq(tmp, tmp);
+    return pselect(is_not_nan, pmul(x, tmp), x);
+  }
+};
+
+template <typename Packet>
+struct generic_reciprocal_newton_step<Packet, 0> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+  run(const Packet& /*unused*/, const Packet& approx_a_recip) {
+    return approx_a_recip;
+  }
+};
+
+
+/** \internal Fast reciprocal sqrt using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess provided in approx_rsqrt must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
+      result.
+   2. If a is zero, approx_rsqrt must be infinite with the same sign as a.
+   3. If a is infinite, approx_rsqrt must be zero with the same sign as a.
+
+ If the preconditions are satisfied, which they are for the _*_rsqrt_ps
+ instructions on x86, the result has a maximum relative error of 2 ulps,
+ and correctly handles zero, infinity, and NaN. Positive denormals are
+ treated as zero.
+*/
+template <typename Packet, int Steps>
+struct generic_rsqrt_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+  run(const Packet& a, const Packet& approx_rsqrt) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
+    const Packet minus_half = pset1<Packet>(Scalar(-0.5));
+
+    // Refine the approximation using one Newton-Raphson step:
+    //   x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n)).
+    // The approximation is expressed this way to avoid over/under-flows.
+    Packet x_newton = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
+    for (int step = 1; step < Steps; ++step) {
+      x_newton = pmul(x_newton, pmadd(pmul(minus_half, x_newton), pmul(a, x_newton), one_point_five));
+    }
+
+    // If approx_rsqrt is 0 or +/-inf, we should return it as is. Note:
+    // on intel, approx_rsqrt can be inf for small denormal values.
+    const Packet return_approx = por(pcmp_eq(approx_rsqrt, pzero(a)),
+                                     pcmp_eq(pabs(approx_rsqrt), pset1<Packet>(NumTraits<Scalar>::infinity())));
+    return pselect(return_approx, approx_rsqrt, x_newton);
+  }
+};
+
+template <typename Packet>
+struct generic_rsqrt_newton_step<Packet, 0> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+  run(const Packet& /*unused*/, const Packet& approx_rsqrt) {
+    return approx_rsqrt;
+  }
+};
+
+
+/** \internal Fast sqrt using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess for the reciprocal sqrt provided in approx_rsqrt must
+      have at least half the leading mantissa bits in the correct result, such
+      that a single Newton-Raphson step is sufficient to get within 1-2 ulps of
+      the correct result.
+   2. If a is zero, approx_rsqrt must be infinite.
+   3. If a is infinite, approx_rsqrt must be zero.
+
+ If the preconditions are satisfied, which they are for the _*_rsqrt_ps
+ instructions on x86, the result has a maximum relative error of 2 ulps,
+ and correctly handles zero and infinity, and NaN. Positive denormal inputs
+ are treated as zero.
+*/
+template <typename Packet, int Steps>
+struct generic_sqrt_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+  run(const Packet& a, const Packet& approx_rsqrt) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
+    const Packet minus_half = pset1<Packet>(Scalar(-0.5));
+    // If a is inf or zero, return a directly.
+    const Packet inf_mask = pcmp_eq(a, pset1<Packet>(NumTraits<Scalar>::infinity()));
+    const Packet return_a = por(pcmp_eq(a, pzero(a)), inf_mask);
+    // Do a single step of Newton's iteration for reciprocal square root:
+    //   x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n)).
+    // The Newton's step is computed this way to avoid over/under-flows.
+    Packet rsqrt = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
+    for (int step = 1; step < Steps; ++step) {
+      rsqrt = pmul(rsqrt, pmadd(pmul(minus_half, rsqrt), pmul(a, rsqrt), one_point_five));
+    }
+
+    // Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
+    // Return a itself for 0 or +inf, NaN for negative arguments.
+    return pselect(return_a, a, pmul(a, rsqrt));
+  }
+};
+
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
     Doesn't do anything fancy, just a 13/6-degree rational interpolant which
     is accurate up to a couple of ulps in the (approximate) range [-8, 8],
     outside of which tanh(x) = +/-1 in single precision. The input is clamped
     to the range [-c, c]. The value c is chosen as the smallest value where
     the approximation evaluates to exactly 1. In the range [-0.0004, 0.0004]
-    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
+    the approximation tanh(x) ~= x is used for better accuracy as x tends to zero.
 
     This implementation works on both scalars and packets.
*/ @@ -88,7 +223,7 @@ RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) EIGEN_USING_STD(sqrt); RealScalar p, qp; p = numext::maxi(x,y); - if(p==RealScalar(0)) return RealScalar(0); + if(numext::is_exactly_zero(p)) return RealScalar(0); qp = numext::mini(y,x) / p; return p * sqrt(RealScalar(1) + qp*qp); } @@ -138,8 +273,8 @@ EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& z) { return (numext::isinf)(y) ? std::complex(NumTraits::infinity(), y) - : x == zero ? std::complex(w, y < zero ? -w : w) - : x > zero ? std::complex(w, y / (2 * w)) + : numext::is_exactly_zero(x) ? std::complex(w, y < zero ? -w : w) + : x > zero ? std::complex(w, y / (2 * w)) : std::complex(numext::abs(y) / (2 * w), y < zero ? -w : w ); } @@ -177,10 +312,10 @@ EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& z) { const T woz = w / abs_z; // Corner cases consistent with 1/sqrt(z) on gcc/clang. return - abs_z == zero ? std::complex(NumTraits::infinity(), NumTraits::quiet_NaN()) - : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex(zero, zero) - : x == zero ? std::complex(woz, y < zero ? woz : -woz) - : x > zero ? std::complex(woz, -y / (2 * w * abs_z)) + numext::is_exactly_zero(abs_z) ? std::complex(NumTraits::infinity(), NumTraits::quiet_NaN()) + : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex(zero, zero) + : numext::is_exactly_zero(x) ? std::complex(woz, y < zero ? woz : -woz) + : x > zero ? std::complex(woz, -y / (2 * w * abs_z)) : std::complex(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz ); } diff --git a/libs/eigen/Eigen/src/Core/Matrix.h b/libs/eigen/Eigen/src/Core/Matrix.h index f0e59a9..c7747f1 100644 --- a/libs/eigen/Eigen/src/Core/Matrix.h +++ b/libs/eigen/Eigen/src/Core/Matrix.h @@ -11,37 +11,39 @@ #ifndef EIGEN_MATRIX_H #define EIGEN_MATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -struct traits > +template +struct traits > { private: - enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; - typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + constexpr static int size = internal::size_at_compile_time(Rows_,Cols_); + typedef typename find_best_packet::type PacketScalar; enum { - row_major_bit = _Options&RowMajor ? RowMajorBit : 0, - is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, - max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, - default_alignment = compute_default_alignment<_Scalar,max_size>::value, - actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + row_major_bit = Options_&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = MaxRows_==Dynamic || MaxCols_==Dynamic, + max_size = is_dynamic_size_storage ? Dynamic : MaxRows_*MaxCols_, + default_alignment = compute_default_alignment::value, + actual_alignment = ((Options_&DontAlign)==0) ? default_alignment : 0, required_alignment = unpacket_traits::alignment, - packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 + packet_access_bit = (packet_traits::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? 
PacketAccessBit : 0 }; public: - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Dense StorageKind; typedef Eigen::Index StorageIndex; typedef MatrixXpr XprKind; enum { - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _MaxRows, - MaxColsAtCompileTime = _MaxCols, - Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - Options = _Options, + RowsAtCompileTime = Rows_, + ColsAtCompileTime = Cols_, + MaxRowsAtCompileTime = MaxRows_, + MaxColsAtCompileTime = MaxCols_, + Flags = compute_matrix_flags(Options_), + Options = Options_, InnerStrideAtCompileTime = 1, OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, @@ -63,18 +65,18 @@ public: * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note"). * * The first three template parameters are required: - * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex. + * \tparam Scalar_ Numeric type, e.g. float, double, int or std::complex. * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). - * \tparam _Rows Number of rows, or \b Dynamic - * \tparam _Cols Number of columns, or \b Dynamic + * \tparam Rows_ Number of rows, or \b Dynamic + * \tparam Cols_ Number of columns, or \b Dynamic * * The remaining template parameters are optional -- in most cases you don't have to worry about them. - * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either * \b #AutoAlign or \b #DontAlign. * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required * for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size. - * \tparam _MaxRows Maximum number of rows. Defaults to \a _Rows (\ref maxrows "note"). - * \tparam _MaxCols Maximum number of columns. Defaults to \a _Cols (\ref maxrows "note"). + * \tparam MaxRows_ Maximum number of rows. Defaults to \a Rows_ (\ref maxrows "note"). + * \tparam MaxCols_ Maximum number of columns. Defaults to \a Cols_ (\ref maxrows "note"). * * Eigen provides a number of typedefs covering the usual cases. Here are some examples: * @@ -128,12 +130,12 @@ public: * Note that \em dense matrices, be they Fixed-size or Dynamic-size, do not expand dynamically in the sense of a std::map. * If you want this behavior, see the Sparse module. * - *
\anchor maxrows _MaxRows and _MaxCols:
+ *
\anchor maxrows MaxRows_ and MaxCols_:
*
In most cases, one just leaves these parameters to the default values. * These parameters specify the maximum number of rows and columns that the matrix may have. They are useful in cases * when the exact numbers of rows and columns are not known at compile-time, but it is known at compile-time that they cannot - * exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case _MaxRows and _MaxCols - * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.
+ * exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case MaxRows_ and MaxCols_ + * are the dimensions of the original matrix, while Rows_ and Cols_ are Dynamic. * * * ABI and storage layout @@ -174,9 +176,9 @@ public: * \ref TopicStorageOrders */ -template +template class Matrix - : public PlainObjectBase > + : public PlainObjectBase > { public: @@ -185,7 +187,7 @@ class Matrix */ typedef PlainObjectBase Base; - enum { Options = _Options }; + enum { Options = Options_ }; EIGEN_DENSE_PUBLIC_INTERFACE(Matrix) @@ -258,7 +260,6 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix() : Base() { - Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } @@ -266,24 +267,18 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) - { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } -#if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) - : Base(std::move(other)) - { - Base::_check_template_params(); - } + : Base(std::move(other)) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { Base::operator=(std::move(other)); return *this; } -#endif -#if EIGEN_HAS_CXX11 /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) * * Example: \include Matrix_variadic_ctor_cxx11.cpp @@ -317,9 +312,9 @@ class Matrix * * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} -#endif // end EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE Matrix( + const std::initializer_list>& list) + : Base(list) {} #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -328,7 +323,6 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(const T& x) { - Base::_check_template_params(); Base::template _init1(x); } @@ -336,7 +330,6 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) { - Base::_check_template_params(); Base::template _init2(x, y); } @@ -388,7 +381,6 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { - Base::_check_template_params(); EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3) m_storage.data()[0] = x; m_storage.data()[1] = y; @@ -400,7 +392,6 @@ class Matrix EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { - Base::_check_template_params(); EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4) m_storage.data()[0] = x; m_storage.data()[1] = y; @@ -480,16 +471,21 @@ class Matrix #define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \ /** \ingroup matrixtypedefs */ \ +/** \brief `Size`×`Size` matrix of type `Type`. */ \ typedef Matrix Matrix##SizeSuffix##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief `Size`×`1` vector of type `Type`. */ \ typedef Matrix Vector##SizeSuffix##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief `1`×`Size` vector of type `Type`. 
*/ \ typedef Matrix RowVector##SizeSuffix##TypeSuffix; #define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size) \ /** \ingroup matrixtypedefs */ \ +/** \brief `Size`×`Dynamic` matrix of type `Type`. */ \ typedef Matrix Matrix##Size##X##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief `Dynamic`×`Size` matrix of type `Type`. */ \ typedef Matrix Matrix##X##Size##TypeSuffix; #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \ @@ -511,30 +507,28 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#if EIGEN_HAS_CXX11 - -#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Matrix##SizeSuffix = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Vector##SizeSuffix = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ +#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 `Size`×`Size` matrix of type `Type`.*/ \ +template \ +using Matrix##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 `Size`×`1` vector of type `Type`.*/ \ +template \ +using Vector##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 `1`×`Size` vector of type `Type`.*/ \ +template \ using RowVector##SizeSuffix = Matrix; -#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Matrix##Size##X = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 `Size`×`Dynamic` matrix of type `Type` */ \ +template \ +using Matrix##Size##X = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 `Dynamic`×`Size` matrix of type `Type`. */ \ +template \ using Matrix##X##Size = Matrix; EIGEN_MAKE_TYPEDEFS(2, 2) @@ -546,20 +540,18 @@ EIGEN_MAKE_FIXED_TYPEDEFS(3) EIGEN_MAKE_FIXED_TYPEDEFS(4) /** \ingroup matrixtypedefs - * \brief \cpp11 */ + * \brief \cpp11 `Size`×`1` vector of type `Type`. */ template using Vector = Matrix; /** \ingroup matrixtypedefs - * \brief \cpp11 */ + * \brief \cpp11 `1`×`Size` vector of type `Type`. */ template using RowVector = Matrix; #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#endif // EIGEN_HAS_CXX11 - } // end namespace Eigen #endif // EIGEN_MATRIX_H diff --git a/libs/eigen/Eigen/src/Core/MatrixBase.h b/libs/eigen/Eigen/src/Core/MatrixBase.h index 45c3a59..ea2178f 100644 --- a/libs/eigen/Eigen/src/Core/MatrixBase.h +++ b/libs/eigen/Eigen/src/Core/MatrixBase.h @@ -11,6 +11,8 @@ #ifndef EIGEN_MATRIXBASE_H #define EIGEN_MATRIXBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class MatrixBase @@ -92,8 +94,8 @@ template class MatrixBase #ifndef EIGEN_PARSED_BY_DOXYGEN /** type of the equivalent square matrix */ - typedef Matrix SquareMatrixType; + typedef Matrix SquareMatrixType; #endif // not EIGEN_PARSED_BY_DOXYGEN /** \returns the size of the main diagonal, which is min(rows(),cols()). 
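For orientation, a minimal sketch of what a few of the typedefs and C++11 alias templates generated above expand to in user code. The type names are Eigen's public API; the program itself is illustrative only:

#include <Eigen/Dense>

int main() {
  // Fixed-size typedefs generated by EIGEN_MAKE_TYPEDEFS:
  Eigen::Matrix4f m = Eigen::Matrix4f::Identity();  // Matrix<float, 4, 4>
  Eigen::Vector3d v(1.0, 2.0, 3.0);                 // Matrix<double, 3, 1>
  Eigen::RowVector2i r(1, 2);                       // Matrix<int, 1, 2>
  // Half-dynamic typedefs generated by EIGEN_MAKE_FIXED_TYPEDEFS:
  Eigen::Matrix3Xf m3x(3, 10);                      // 3 rows, run-time column count
  // C++11 alias templates defined in the hunk above:
  Eigen::Vector<float, 5> v5 = Eigen::Vector<float, 5>::Zero();
  Eigen::Matrix4<double> m4 = Eigen::Matrix4<double>::Identity();
  // MaxRows_/MaxCols_ in action: run-time size with a compile-time bound,
  // so the storage can stay on the stack (no heap allocation).
  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, 0, 4, 4> bounded(2, 3);
  return 0;
}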
@@ -107,10 +109,10 @@ template class MatrixBase /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal the return type of MatrixBase::adjoint() */ - typedef typename internal::conditional::IsComplex, - CwiseUnaryOp, ConstTransposeReturnType>, - ConstTransposeReturnType - >::type AdjointReturnType; + typedef std::conditional_t::IsComplex, + CwiseUnaryOp, ConstTransposeReturnType>, + ConstTransposeReturnType + > AdjointReturnType; /** \internal Return type of eigenvalues() */ typedef Matrix, internal::traits::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType; /** \internal the return type of identity */ @@ -184,6 +186,11 @@ template class MatrixBase const Product operator*(const DiagonalBase &diagonal) const; + template + EIGEN_DEVICE_FUNC + const Product + operator*(const SkewSymmetricBase &skew) const; + template EIGEN_DEVICE_FUNC typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType @@ -206,28 +213,22 @@ template class MatrixBase EIGEN_DEVICE_FUNC DiagonalReturnType diagonal(); - typedef typename internal::add_const >::type ConstDiagonalReturnType; + typedef Diagonal ConstDiagonalReturnType; EIGEN_DEVICE_FUNC - ConstDiagonalReturnType diagonal() const; - - template struct DiagonalIndexReturnType { typedef Diagonal Type; }; - template struct ConstDiagonalIndexReturnType { typedef const Diagonal Type; }; + const ConstDiagonalReturnType diagonal() const; template EIGEN_DEVICE_FUNC - typename DiagonalIndexReturnType::Type diagonal(); + Diagonal diagonal(); template EIGEN_DEVICE_FUNC - typename ConstDiagonalIndexReturnType::Type diagonal() const; - - typedef Diagonal DiagonalDynamicIndexReturnType; - typedef typename internal::add_const >::type ConstDiagonalDynamicIndexReturnType; + const Diagonal diagonal() const; EIGEN_DEVICE_FUNC - DiagonalDynamicIndexReturnType diagonal(Index index); + Diagonal diagonal(Index index); EIGEN_DEVICE_FUNC - ConstDiagonalDynamicIndexReturnType diagonal(Index index) const; + const Diagonal diagonal(Index index) const; template struct TriangularViewReturnType { typedef TriangularView Type; }; template struct ConstTriangularViewReturnType { typedef const TriangularView Type; }; @@ -263,6 +264,8 @@ template class MatrixBase EIGEN_DEVICE_FUNC const DiagonalWrapper asDiagonal() const; const PermutationWrapper asPermutation() const; + EIGEN_DEVICE_FUNC + const SkewSymmetricWrapper asSkewSymmetric() const; EIGEN_DEVICE_FUNC Derived& setIdentity(); @@ -277,6 +280,8 @@ template class MatrixBase bool isUpperTriangular(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isLowerTriangular(const RealScalar& prec = NumTraits::dummy_precision()) const; + bool isSkewSymmetric(const RealScalar& prec = NumTraits::dummy_precision()) const; + template bool isOrthogonal(const MatrixBase& other, const RealScalar& prec = NumTraits::dummy_precision()) const; @@ -368,25 +373,23 @@ template class MatrixBase /////////// SVD module /////////// - inline JacobiSVD jacobiSvd(unsigned int computationOptions = 0) const; - inline BDCSVD bdcSvd(unsigned int computationOptions = 0) const; + template + inline JacobiSVD jacobiSvd() const; + template + EIGEN_DEPRECATED + inline JacobiSVD jacobiSvd(unsigned int computationOptions) const; + + template + inline BDCSVD bdcSvd() const; + template + EIGEN_DEPRECATED + inline BDCSVD bdcSvd(unsigned int computationOptions) const; /////////// Geometry module /////////// - #ifndef EIGEN_PARSED_BY_DOXYGEN - /// 
\internal helper struct to form the return type of the cross product - template struct cross_product_return_type { - typedef typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType Scalar; - typedef Matrix type; - }; - #endif // EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC -#ifndef EIGEN_PARSED_BY_DOXYGEN - inline typename cross_product_return_type::type -#else - inline PlainObject -#endif + inline typename internal::cross_impl::return_type cross(const MatrixBase& other) const; template @@ -468,11 +471,9 @@ template class MatrixBase const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) -#if EIGEN_HAS_CXX11_MATH EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic tangent) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) -#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/libs/eigen/Eigen/src/Core/NestByValue.h b/libs/eigen/Eigen/src/Core/NestByValue.h index b427576..311cb5a 100644 --- a/libs/eigen/Eigen/src/Core/NestByValue.h +++ b/libs/eigen/Eigen/src/Core/NestByValue.h @@ -11,6 +11,8 @@ #ifndef EIGEN_NESTBYVALUE_H #define EIGEN_NESTBYVALUE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -41,6 +43,8 @@ template class NestByValue public: typedef typename internal::dense_xpr_base::type Base; + static constexpr bool HasDirectAccess = internal::has_direct_access::ret; + EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue) EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {} @@ -52,6 +56,18 @@ template class NestByValue EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + EIGEN_DEVICE_FUNC typename std::enable_if::type data() const { + return m_expression.data(); + } + + EIGEN_DEVICE_FUNC typename std::enable_if::type innerStride() const { + return m_expression.innerStride(); + } + + EIGEN_DEVICE_FUNC typename std::enable_if::type outerStride() const { + return m_expression.outerStride(); + } + protected: const ExpressionType m_expression; }; diff --git a/libs/eigen/Eigen/src/Core/NoAlias.h b/libs/eigen/Eigen/src/Core/NoAlias.h index 570283d..09c0aac 100644 --- a/libs/eigen/Eigen/src/Core/NoAlias.h +++ b/libs/eigen/Eigen/src/Core/NoAlias.h @@ -10,6 +10,8 @@ #ifndef EIGEN_NOALIAS_H #define EIGEN_NOALIAS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class NoAlias diff --git a/libs/eigen/Eigen/src/Core/NumTraits.h b/libs/eigen/Eigen/src/Core/NumTraits.h index 72eac5a..53362ef 100644 --- a/libs/eigen/Eigen/src/Core/NumTraits.h +++ b/libs/eigen/Eigen/src/Core/NumTraits.h @@ -10,6 +10,8 @@ #ifndef EIGEN_NUMTRAITS_H #define EIGEN_NUMTRAITS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -61,10 +63,10 @@ struct default_digits_impl // Floating point { EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { - using std::log; + using std::log2; using std::ceil; typedef typename NumTraits::Real Real; - return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + return int(ceil(-log2(NumTraits::epsilon()))); } }; @@ -83,17 +85,17 @@ namespace numext { //
TODO: Replace by std::bit_cast (available in C++20) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { -#if EIGEN_HAS_TYPE_TRAITS // The behaviour of memcpy is not specified for non-trivially copyable types EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED); EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value && std::is_default_constructible::value, THIS_TYPE_IS_NOT_SUPPORTED); -#endif - EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); + Tgt tgt; + // Load src into registers first. This allows the memcpy to be elided by CUDA. + const Src staged = src; EIGEN_USING_STD(memcpy) - memcpy(&tgt, &src, sizeof(Tgt)); + memcpy(static_cast(&tgt),static_cast(&staged), sizeof(Tgt)); return tgt; } } // namespace numext @@ -162,11 +164,7 @@ template struct GenericNumTraits }; typedef T Real; - typedef typename internal::conditional< - IsInteger, - typename internal::conditional::type, - T - >::type NonInteger; + typedef std::conditional_t, T> NonInteger; typedef T Nested; typedef T Literal; @@ -252,15 +250,15 @@ template<> struct NumTraits static inline long double dummy_precision() { return 1e-15l; } }; -template struct NumTraits > - : GenericNumTraits > +template struct NumTraits > + : GenericNumTraits > { - typedef _Real Real; - typedef typename NumTraits<_Real>::Literal Literal; + typedef Real_ Real; + typedef typename NumTraits::Literal Literal; enum { IsComplex = 1, - RequireInitialization = NumTraits<_Real>::RequireInitialization, - ReadCost = 2 * NumTraits<_Real>::ReadCost, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, AddCost = 2 * NumTraits::AddCost, MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; diff --git a/libs/eigen/Eigen/src/Core/PartialReduxEvaluator.h b/libs/eigen/Eigen/src/Core/PartialReduxEvaluator.h index 29abf35..693fc35 100644 --- a/libs/eigen/Eigen/src/Core/PartialReduxEvaluator.h +++ b/libs/eigen/Eigen/src/Core/PartialReduxEvaluator.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PARTIALREDUX_H #define EIGEN_PARTIALREDUX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -29,7 +31,7 @@ namespace internal { * some (optional) processing of the outcome, e.g., division by n for mean. * * For the vectorized path let's observe that the packet-size and outer-unrolling -* are both decided by the assignement logic. So all we have to do is to decide +* are both decided by the assignment logic. So all we have to do is to decide * on the inner unrolling. 
* * For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, @@ -54,12 +56,17 @@ struct packetwise_redux_traits /* Value to be returned when size==0 , by default let's return 0 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } +PacketType packetwise_redux_empty_value(const Func& ) { + const typename unpacket_traits::type zero(0); + return pset1(zero); +} /* For products the default is 1 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { + return pset1(Scalar(1)); +} /* Perform the actual reduction */ template > { typedef PartialReduxExpr XprType; typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::add_const_on_value_type::type ConstArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef add_const_on_value_type_t ConstArgTypeNested; + typedef internal::remove_all_t ArgTypeNestedCleaned; typedef typename ArgType::Scalar InputScalar; typedef typename XprType::Scalar Scalar; enum { @@ -147,16 +154,16 @@ struct evaluator > : TraversalSize==0 ? 1 : int(TraversalSize) * int(evaluator::CoeffReadCost) + int(CostOpType::value), - _ArgFlags = evaluator::Flags, + ArgFlags_ = evaluator::Flags, - _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) + Vectorizable_ = bool(int(ArgFlags_)&PacketAccessBit) && bool(MemberOp::Vectorizable) - && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) + && (Direction==int(Vertical) ? bool(ArgFlags_&RowMajorBit) : (ArgFlags_&RowMajorBit)==0) && (TraversalSize!=0), Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) - | (_Vectorizable ? PacketAccessBit : 0) + | (Vectorizable_ ? 
PacketAccessBit : 0) | LinearAccessBit, Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized diff --git a/libs/eigen/Eigen/src/Core/PermutationMatrix.h b/libs/eigen/Eigen/src/Core/PermutationMatrix.h index 69401bf..73a7300 100644 --- a/libs/eigen/Eigen/src/Core/PermutationMatrix.h +++ b/libs/eigen/Eigen/src/Core/PermutationMatrix.h @@ -11,6 +11,8 @@ #ifndef EIGEN_PERMUTATIONMATRIX_H #define EIGEN_PERMUTATIONMATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -269,13 +271,13 @@ class PermutationBase : public EigenBase }; namespace internal { -template -struct traits > - : traits > +template +struct traits > + : traits > { typedef PermutationStorage StorageKind; - typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType; - typedef _StorageIndex StorageIndex; + typedef Matrix IndicesType; + typedef StorageIndex_ StorageIndex; typedef void Scalar; }; } @@ -287,14 +289,14 @@ struct traits -class PermutationMatrix : public PermutationBase > +template +class PermutationMatrix : public PermutationBase > { typedef PermutationBase Base; typedef internal::traits Traits; @@ -389,20 +391,20 @@ class PermutationMatrix : public PermutationBase -struct traits,_PacketAccess> > - : traits > +template +struct traits,PacketAccess_> > + : traits > { typedef PermutationStorage StorageKind; - typedef Map, _PacketAccess> IndicesType; - typedef _StorageIndex StorageIndex; + typedef Map, PacketAccess_> IndicesType; + typedef StorageIndex_ StorageIndex; typedef void Scalar; }; } -template -class Map,_PacketAccess> - : public PermutationBase,_PacketAccess> > +template +class Map,PacketAccess_> + : public PermutationBase,PacketAccess_> > { typedef PermutationBase Base; typedef internal::traits Traits; @@ -452,18 +454,18 @@ class Map class TranspositionsWrapper; +template class TranspositionsWrapper; namespace internal { -template -struct traits > +template +struct traits > { typedef PermutationStorage StorageKind; typedef void Scalar; - typedef typename _IndicesType::Scalar StorageIndex; - typedef _IndicesType IndicesType; + typedef typename IndicesType_::Scalar StorageIndex; + typedef IndicesType_ IndicesType; enum { - RowsAtCompileTime = _IndicesType::SizeAtCompileTime, - ColsAtCompileTime = _IndicesType::SizeAtCompileTime, + RowsAtCompileTime = IndicesType_::SizeAtCompileTime, + ColsAtCompileTime = IndicesType_::SizeAtCompileTime, MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime, MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime, Flags = 0 @@ -476,14 +478,14 @@ struct traits > * * \brief Class to view a vector of integers as a permutation matrix * - * \tparam _IndicesType the type of the vector of integer (can be any compatible expression) + * \tparam IndicesType_ the type of the vector of integers (can be any compatible expression) * * This class allows viewing any vector expression of integers as a permutation matrix.
* * \sa class PermutationBase, class PermutationMatrix */ -template -class PermutationWrapper : public PermutationBase > +template +class PermutationWrapper : public PermutationBase > { typedef PermutationBase Base; typedef internal::traits Traits; @@ -498,7 +500,7 @@ class PermutationWrapper : public PermutationBase::type& + const internal::remove_all_t& indices() const { return m_indices; } protected: diff --git a/libs/eigen/Eigen/src/Core/PlainObjectBase.h b/libs/eigen/Eigen/src/Core/PlainObjectBase.h index e2ddbd1..60a75b1 100644 --- a/libs/eigen/Eigen/src/Core/PlainObjectBase.h +++ b/libs/eigen/Eigen/src/Core/PlainObjectBase.h @@ -22,23 +22,20 @@ # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED #endif +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { template struct check_rows_cols_for_overflow { - template - EIGEN_DEVICE_FUNC - static EIGEN_ALWAYS_INLINE void run(Index, Index) - { - } + template + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index, Index) {} }; template<> struct check_rows_cols_for_overflow { - template - EIGEN_DEVICE_FUNC - static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols) - { + template + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index rows, Index cols) { // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242 // we assume Index is signed Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; @@ -64,18 +61,18 @@ namespace doxygen { // This is a workaround to doxygen not being able to understand the inheritance logic // when it is hidden by the dense_xpr_base helper struct. // Moreover, doxygen fails to include members that are not documented in the declaration body of -// MatrixBase if we inherits MatrixBase >, +// MatrixBase if we inherit MatrixBase >, // this is why we simply inherit MatrixBase, though this does not make sense. /** This class is just a workaround for Doxygen and it does not actually exist. */ template struct dense_xpr_base_dispatcher; /** This class is just a workaround for Doxygen and it does not actually exist.
*/ -template -struct dense_xpr_base_dispatcher > +template +struct dense_xpr_base_dispatcher > : public MatrixBase {}; /** This class is just a workaround for Doxygen and it does not actually exist. */ -template -struct dense_xpr_base_dispatcher > +template +struct dense_xpr_base_dispatcher > : public ArrayBase {}; } // namespace doxygen @@ -134,6 +131,16 @@ class PlainObjectBase : public internal::dense_xpr_base::type enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits::Alignment>0) }; EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) + EIGEN_STATIC_ASSERT(internal::check_implication(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT(internal::check_implication(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT((MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_STATIC_ASSERT(((Options & (DontAlign|RowMajor)) == Options), INVALID_MATRIX_TEMPLATE_PARAMETERS) + EIGEN_DEVICE_FUNC Base& base() { return *static_cast(this); } EIGEN_DEVICE_FUNC @@ -148,12 +155,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. * * See DenseCoeffsBase::coeff(Index) const for details. */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const - { - if(Flags & RowMajorBit) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index rowId, Index colId) const { + if (Flags & RowMajorBit) return m_storage.data()[colId + rowId * m_storage.cols()]; - else // column-major + else // column-major return m_storage.data()[rowId + colId * m_storage.rows()]; } @@ -171,12 +176,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. * * See DenseCoeffsBase::coeffRef(Index,Index) const for details. */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId) - { - if(Flags & RowMajorBit) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index rowId, Index colId) { + if (Flags & RowMajorBit) return m_storage.data()[colId + rowId * m_storage.cols()]; - else // column-major + else // column-major return m_storage.data()[rowId + colId * m_storage.rows()]; } @@ -184,28 +187,20 @@ class PlainObjectBase : public internal::dense_xpr_base::type * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. * * See DenseCoeffsBase::coeffRef(Index) const for details. */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - return m_storage.data()[index]; - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { return m_storage.data()[index]; } /** This is the const version of coeffRef(Index,Index) which is thus a synonym of coeff(Index,Index).
* It is provided for convenience. */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const - { - if(Flags & RowMajorBit) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index rowId, Index colId) const { + if (Flags & RowMajorBit) return m_storage.data()[colId + rowId * m_storage.cols()]; - else // column-major + else // column-major return m_storage.data()[rowId + colId * m_storage.rows()]; } /** This is the const version of coeffRef(Index) which is thus a synonym of coeff(Index). * It is provided for convenience. */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index index) const { return m_storage.data()[index]; } @@ -267,13 +262,11 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void resize(Index rows, Index cols) - { - eigen_assert( EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime) - && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime) - && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime) - && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index rows, Index cols) { + eigen_assert(internal::check_implication(RowsAtCompileTime!=Dynamic, rows==RowsAtCompileTime) + && internal::check_implication(ColsAtCompileTime!=Dynamic, cols==ColsAtCompileTime) + && internal::check_implication(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic, rows<=MaxRowsAtCompileTime) + && internal::check_implication(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic, cols<=MaxColsAtCompileTime) && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array."); internal::check_rows_cols_for_overflow::run(rows, cols); #ifdef EIGEN_INITIALIZE_COEFFS @@ -297,12 +290,13 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t) */ - EIGEN_DEVICE_FUNC - inline void resize(Index size) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase) - eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0); - #ifdef EIGEN_INITIALIZE_COEFFS + EIGEN_DEVICE_FUNC inline constexpr void resize(Index size) { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase) + eigen_assert( + ((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) || + SizeAtCompileTime == size) && + size >= 0); +#ifdef EIGEN_INITIALIZE_COEFFS bool size_changed = size != this->size(); #endif if(RowsAtCompileTime == 1) @@ -322,11 +316,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - inline void resize(NoChange_t, Index cols) - { - resize(rows(), cols); - } + EIGEN_DEVICE_FUNC inline constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); } /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange * as in the example below.
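Since the \include example file referenced above is not part of this diff, here is a minimal usage sketch of the NoChange resize overloads (public Eigen API; the sizes are illustrative):

#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::MatrixXd m(2, 3);
  m.resize(Eigen::NoChange, 5);  // still 2 rows, now 5 columns
  assert(m.rows() == 2 && m.cols() == 5);
  m.resize(4, Eigen::NoChange);  // still 5 columns, now 4 rows
  assert(m.rows() == 4 && m.cols() == 5);
  // Note: resize() does not preserve coefficients when the total size
  // changes; use conservativeResize() to keep existing values.
  m.conservativeResize(4, 6);
  return 0;
}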
@@ -336,11 +326,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - inline void resize(Index rows, NoChange_t) - { - resize(rows, cols()); - } + EIGEN_DEVICE_FUNC inline constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); } /** Resizes \c *this to have the same dimensions as \a other. * Takes care of doing all the checking that's needed. @@ -475,7 +461,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase() : m_storage() { -// _check_template_params(); // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } @@ -486,11 +471,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert) : m_storage(internal::constructor_without_unaligned_array_assert()) { -// _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #endif -#if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT : m_storage( std::move(other.m_storage) ) @@ -500,11 +484,9 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT { - _check_template_params(); m_storage = std::move(other.m_storage); return *this; } -#endif /** Copy constructor */ EIGEN_DEVICE_FUNC @@ -514,17 +496,14 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols) : m_storage(size, rows, cols) { -// _check_template_params(); // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - #if EIGEN_HAS_CXX11 - /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. * * \only_for_vectors * * This constructor is for 1D arrays or vectors with more than 4 coefficients. - * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. * * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this * constructor must match the fixed number of rows (resp. columns) of \c *this. @@ -534,7 +513,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&...
args) : m_storage() { - _check_template_params(); EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); m_storage.data()[0] = a0; m_storage.data()[1] = a1; @@ -546,14 +524,11 @@ class PlainObjectBase : public internal::dense_xpr_base::type } /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer - * lists \cpp11 + * lists */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) - : m_storage() - { - _check_template_params(); - + EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE PlainObjectBase( + const std::initializer_list>& list) + : m_storage() { size_t list_size = 0; if (list.begin() != list.end()) { list_size = list.begin()->size(); @@ -581,7 +556,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type } } } - #endif // end EIGEN_HAS_CXX11 /** \sa PlainObjectBase::operator=(const EigenBase&) */ template @@ -589,7 +563,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase &other) : m_storage() { - _check_template_params(); resizeLike(other); _set_noalias(other); } @@ -600,7 +573,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase &other) : m_storage() { - _check_template_params(); resizeLike(other); *this = other.derived(); } @@ -609,7 +581,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue& other) { - _check_template_params(); // FIXME this does not automatically transpose vectors if necessary resize(other.rows(), other.cols()); other.evalTo(this->derived()); @@ -640,7 +611,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \see class Map */ - //@{ + ///@{ static inline ConstMapType Map(const Scalar* data) { return ConstMapType(data); } static inline MapType Map(Scalar* data) @@ -704,7 +675,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type template static inline typename StridedAlignedMapType >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride& stride) { return typename StridedAlignedMapType >::type(data, rows, cols, stride); } - //@} + ///@} using Base::setConstant; EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); @@ -800,7 +771,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) + EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, std::enable_if_t* = 0) { const bool t0_is_integer_alike = internal::is_valid_index_type::value; const bool t1_is_integer_alike = internal::is_valid_index_type::value; @@ -812,7 +783,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) + EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, std::enable_if_t* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); @@ -822,10 +793,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, - typename internal::enable_if< (!internal::is_same::value) - && (internal::is_same::value) - && (internal::is_same::value) - && 
Base::SizeAtCompileTime==2,T1>::type* = 0) + std::enable_if_t< (!internal::is_same::value) + && (internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime==2,T1>* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); @@ -836,8 +807,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type // then the argument is meant to be the size of the object. template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if< (Base::SizeAtCompileTime!=1 || !internal::is_convertible::value) - && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) + EIGEN_STRONG_INLINE void _init1(Index size, std::enable_if_t< (Base::SizeAtCompileTime!=1 || !internal::is_convertible::value) + && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>* = 0) { // NOTE MSVC 2008 complains if we directly put bool(NumTraits::IsInteger) as the EIGEN_STATIC_ASSERT argument. const bool is_integer_alike = internal::is_valid_index_type::value; @@ -850,7 +821,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) + EIGEN_STRONG_INLINE void _init1(const Scalar& val0, std::enable_if_t::value,T>* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = val0; @@ -860,10 +831,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, - typename internal::enable_if< (!internal::is_same::value) - && (internal::is_same::value) - && Base::SizeAtCompileTime==1 - && internal::is_convertible::value,T*>::type* = 0) + std::enable_if_t< (!internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime==1 + && internal::is_convertible::value,T*>* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = Scalar(val0); @@ -916,10 +887,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, - typename internal::enable_if< Base::SizeAtCompileTime!=Dynamic - && Base::SizeAtCompileTime!=1 - && internal::is_convertible::value - && internal::is_same::XprKind,ArrayXpr>::value,T>::type* = 0) + std::enable_if_t< Base::SizeAtCompileTime!=Dynamic + && Base::SizeAtCompileTime!=1 + && internal::is_convertible::value + && internal::is_same::XprKind,ArrayXpr>::value,T>* = 0) { Base::setConstant(val0); } @@ -928,12 +899,12 @@ class PlainObjectBase : public internal::dense_xpr_base::type template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, - typename internal::enable_if< (!internal::is_same::value) - && (internal::is_same::value) - && Base::SizeAtCompileTime!=Dynamic - && Base::SizeAtCompileTime!=1 - && internal::is_convertible::value - && internal::is_same::XprKind,ArrayXpr>::value,T*>::type* = 0) + std::enable_if_t< (!internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime!=Dynamic + && Base::SizeAtCompileTime!=1 + && internal::is_convertible::value + && internal::is_same::XprKind,ArrayXpr>::value,T*>* = 0) { Base::setConstant(val0); } @@ -964,21 +935,6 @@ 
class PlainObjectBase : public internal::dense_xpr_base::type void swap(DenseBase const & other) { Base::swap(other.derived()); } - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void _check_template_params() - { - EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor) - && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0) - && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0)) - && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0)) - && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0)) - && ((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0)) - && (MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic) - && (MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic) - && (Options & (DontAlign|RowMajor)) == Options), - INVALID_MATRIX_TEMPLATE_PARAMETERS) - } - enum { IsPlainObjectBase = 1 }; #endif public: @@ -999,11 +955,7 @@ namespace internal { template struct conservative_resize_like_impl { - #if EIGEN_HAS_TYPE_TRAITS - static const bool IsRelocatable = std::is_trivially_copyable::value; - #else - static const bool IsRelocatable = !NumTraits::RequireInitialization; - #endif + static constexpr bool IsRelocatable = std::is_trivially_copyable::value; static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; diff --git a/libs/eigen/Eigen/src/Core/Product.h b/libs/eigen/Eigen/src/Core/Product.h index 70a6c10..85842d1 100644 --- a/libs/eigen/Eigen/src/Core/Product.h +++ b/libs/eigen/Eigen/src/Core/Product.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PRODUCT_H #define EIGEN_PRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class ProductImpl; @@ -19,8 +21,8 @@ namespace internal { template struct traits > { - typedef typename remove_all::type LhsCleaned; - typedef typename remove_all::type RhsCleaned; + typedef remove_all_t LhsCleaned; + typedef remove_all_t RhsCleaned; typedef traits LhsTraits; typedef traits RhsTraits; @@ -40,7 +42,7 @@ struct traits > MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime, // FIXME: only needed by GeneralMatrixMatrixTriangular - InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime), + InnerSize = min_size_prefer_fixed(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime), // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator. Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit @@ -58,8 +60,8 @@ struct traits > * * \brief Expression of the product of two arbitrary matrices or vectors * - * \tparam _Lhs the type of the left-hand side expression - * \tparam _Rhs the type of the right-hand side expression + * \tparam Lhs_ the type of the left-hand side expression + * \tparam Rhs_ the type of the right-hand side expression * * This class represents an expression of the product of two arbitrary matrices. 
* @@ -67,16 +69,16 @@ struct traits > * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct * */ -template -class Product : public ProductImpl<_Lhs,_Rhs,Option, - typename internal::product_promote_storage_type::StorageKind, - typename internal::traits<_Rhs>::StorageKind, - internal::product_type<_Lhs,_Rhs>::ret>::ret> +template +class Product : public ProductImpl::StorageKind, + typename internal::traits::StorageKind, + internal::product_type::ret>::ret> { public: - typedef _Lhs Lhs; - typedef _Rhs Rhs; + typedef Lhs_ Lhs; + typedef Rhs_ Rhs; typedef typename ProductImpl< Lhs, Rhs, Option, @@ -87,8 +89,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::ref_selector::type LhsNested; typedef typename internal::ref_selector::type RhsNested; - typedef typename internal::remove_all::type LhsNestedCleaned; - typedef typename internal::remove_all::type RhsNestedCleaned; + typedef internal::remove_all_t LhsNestedCleaned; + typedef internal::remove_all_t RhsNestedCleaned; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) diff --git a/libs/eigen/Eigen/src/Core/ProductEvaluators.h b/libs/eigen/Eigen/src/Core/ProductEvaluators.h index 8cf294b..9da2406 100644 --- a/libs/eigen/Eigen/src/Core/ProductEvaluators.h +++ b/libs/eigen/Eigen/src/Core/ProductEvaluators.h @@ -13,6 +13,8 @@ #ifndef EIGEN_PRODUCTEVALUATORS_H #define EIGEN_PRODUCTEVALUATORS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -107,14 +109,14 @@ struct product_evaluator, ProductTag, LhsShape, RhsSh explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); // FIXME shall we handle nested_eval here?, // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.) 
// typedef typename internal::nested_eval::type LhsNested; // typedef typename internal::nested_eval::type RhsNested; -// typedef typename internal::remove_all::type LhsNestedCleaned; -// typedef typename internal::remove_all::type RhsNestedCleaned; +// typedef internal::remove_all_t LhsNestedCleaned; +// typedef internal::remove_all_t RhsNestedCleaned; // // const LhsNested lhs(xpr.lhs()); // const RhsNested rhs(xpr.rhs()); @@ -134,7 +136,7 @@ protected: // Dense = Product template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar> struct Assignment, internal::assign_op, Dense2Dense, - typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> + std::enable_if_t<(Options==DefaultProduct || Options==AliasFreeProduct)>> { typedef Product SrcXprType; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -152,7 +154,7 @@ struct Assignment, internal::assign_op struct Assignment, internal::add_assign_op, Dense2Dense, - typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> + std::enable_if_t<(Options==DefaultProduct || Options==AliasFreeProduct)>> { typedef Product SrcXprType; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -167,7 +169,7 @@ struct Assignment, internal::add_assign_op< // Dense -= Product template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar> struct Assignment, internal::sub_assign_op, Dense2Dense, - typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> + std::enable_if_t<(Options==DefaultProduct || Options==AliasFreeProduct)>> { typedef Product SrcXprType; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -296,7 +298,7 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, cons template struct generic_product_impl { - template struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; + template struct is_row_major : std::conditional_t<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type> {}; typedef typename Product::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose @@ -370,7 +372,7 @@ struct generic_product_impl typedef typename nested_eval::type RhsNested; typedef typename Product::Scalar Scalar; enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight }; - typedef typename internal::remove_all::type>::type MatrixType; + typedef internal::remove_all_t> MatrixType; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) @@ -427,8 +429,8 @@ struct generic_product_impl // 3 - it makes this fallback consistent with the heavy GEMM routine. // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. // (see https://stackoverflow.com/questions/54738495) - // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, - // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently + // For small fixed sizes matrices, however, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... This is why this re-writing strategy is currently // enabled only when falling back from the main GEMM. 
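For context, a minimal sketch contrasting the two product paths this fallback mediates between (public Eigen API; the matrices and sizes are illustrative only):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3f a = Eigen::Matrix3f::Random();
  Eigen::Matrix3f b = Eigen::Matrix3f::Random();
  Eigen::Matrix3f c;
  // Default product: may dispatch to the heavy GEMM kernel.
  c.noalias() = a * b;
  // Coefficient-based product: evaluated lazily, with no temporary; this is
  // essentially what the rewrite described above falls back to.
  c.noalias() = a.lazyProduct(b);
  return 0;
}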
template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -448,7 +450,7 @@ struct generic_product_impl blas_traits::extract(rhs).template conjugateIf(), func, actualAlpha, - typename conditional::type()); + std::conditional_t()); } protected: @@ -458,7 +460,7 @@ protected: void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) { EIGEN_UNUSED_VARIABLE(s); - eigen_internal_assert(s==Scalar(1)); + eigen_internal_assert(numext::is_exactly_one(s)); call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); } @@ -526,8 +528,8 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename internal::nested_eval::type LhsNested; typedef typename internal::nested_eval::type RhsNested; - typedef typename internal::remove_all::type LhsNestedCleaned; - typedef typename internal::remove_all::type RhsNestedCleaned; + typedef internal::remove_all_t LhsNestedCleaned; + typedef internal::remove_all_t RhsNestedCleaned; typedef evaluator LhsEtorType; typedef evaluator RhsEtorType; @@ -535,7 +537,7 @@ struct product_evaluator, ProductTag, DenseShape, enum { RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime, ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime, - InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), + InnerSize = min_size_prefer_fixed(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime, MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime }; @@ -564,8 +566,8 @@ struct product_evaluator, ProductTag, DenseShape, RhsVecPacketSize = unpacket_traits::size, // Here, we don't care about alignment larger than the usable packet size. - LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), - RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), + LhsAlignment = plain_enum_min(LhsEtorType::Alignment, LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), + RhsAlignment = plain_enum_min(RhsEtorType::Alignment, RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), SameType = is_same::value, @@ -585,8 +587,8 @@ struct product_evaluator, ProductTag, DenseShape, LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), - Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment) - : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment) + Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % plain_enum_max(1, LhsAlignment))!=0 ? 0 : LhsAlignment) + : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % plain_enum_max(1, RhsAlignment))!=0 ? 0 : RhsAlignment) : 0, /* CanVectorizeInner deserves special explanation. It does not affect the product flags. 
It is not used outside @@ -640,8 +642,8 @@ struct product_evaluator, ProductTag, DenseShape, } protected: - typename internal::add_const_on_value_type::type m_lhs; - typename internal::add_const_on_value_type::type m_rhs; + add_const_on_value_type_t m_lhs; + add_const_on_value_type_t m_rhs; LhsEtorType m_lhsImpl; RhsEtorType m_rhsImpl; @@ -836,22 +838,22 @@ public: MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, - _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor + StorageOrder_ = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, - _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), + SameStorageOrder_ = StorageOrder_ == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), - _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) - ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), - _SameTypes = is_same::value, + ScalarAccessOnDiag_ = !((int(StorageOrder_) == ColMajor && int(ProductOrder) == OnTheLeft) + ||(int(StorageOrder_) == RowMajor && int(ProductOrder) == OnTheRight)), + SameTypes_ = is_same::value, // FIXME currently we need same types, but in the future the next rule should be the one - //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) - && _SameTypes - && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) - && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), - _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, - Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), + //Vectorizable_ = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (SameTypes_ && bool(int(DiagFlags)&PacketAccessBit))), + Vectorizable_ = bool(int(MatrixFlags)&PacketAccessBit) + && SameTypes_ + && (SameStorageOrder_ || (MatrixFlags&LinearAccessBit)==LinearAccessBit) + && (ScalarAccessOnDiag_ || (bool(int(DiagFlags)&PacketAccessBit))), + LinearAccessMask_ = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, + Flags = ((HereditaryBits|LinearAccessMask_) & (unsigned int)(MatrixFlags)) | (Vectorizable_ ? PacketAccessBit : 0), Alignment = evaluator::Alignment, AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) @@ -887,7 +889,7 @@ protected: { enum { InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, - DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator::Alignment)) // FIXME hardcoded 16!! + DiagonalPacketLoadMode = plain_enum_min(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator::Alignment)) // FIXME hardcoded 16!! 
}; return internal::pmul(m_matImpl.template packet(row, col), m_diagImpl.template packet(id)); @@ -913,7 +915,7 @@ struct product_evaluator, ProductTag, DiagonalSha typedef typename Lhs::DiagonalVectorType DiagonalType; - enum { StorageOrder = Base::_StorageOrder }; + enum { StorageOrder = Base::StorageOrder_ }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) @@ -932,7 +934,7 @@ struct product_evaluator, ProductTag, DiagonalSha // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. // See also similar calls below. return this->template packet_impl(row,col, row, - typename internal::conditional::type()); + std::conditional_t()); } template @@ -957,7 +959,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - enum { StorageOrder = Base::_StorageOrder }; + enum { StorageOrder = Base::StorageOrder_ }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) @@ -974,7 +976,7 @@ struct product_evaluator, ProductTag, DenseShape, EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return this->template packet_impl(row,col, col, - typename internal::conditional::type()); + std::conditional_t()); } template @@ -1001,7 +1003,7 @@ template struct permutation_matrix_product { typedef typename nested_eval::type MatrixType; - typedef typename remove_all::type MatrixTypeCleaned; + typedef remove_all_t MatrixTypeCleaned; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) @@ -1109,7 +1111,7 @@ template::type MatrixType; - typedef typename remove_all::type MatrixTypeCleaned; + typedef remove_all_t MatrixTypeCleaned; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) @@ -1172,6 +1174,40 @@ struct generic_product_impl, MatrixShape, TranspositionsShap } }; +/*************************************************************************** +* skew symmetric products +* for now we just call the generic implementation +***************************************************************************/ +template +struct generic_product_impl +{ + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + generic_product_impl::evalTo(dst, lhs, rhs); + } +}; + +template +struct generic_product_impl +{ + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + generic_product_impl::evalTo(dst, lhs, rhs); + } +}; + +template +struct generic_product_impl +{ + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + generic_product_impl::evalTo(dst, lhs, rhs); + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/Random.h b/libs/eigen/Eigen/src/Core/Random.h index dab2ac8..fab6889 100644 --- a/libs/eigen/Eigen/src/Core/Random.h +++ b/libs/eigen/Eigen/src/Core/Random.h @@ -10,12 +10,13 @@ #ifndef EIGEN_RANDOM_H #define EIGEN_RANDOM_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { template struct scalar_random_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op) inline const Scalar operator() () const { return random(); } }; diff --git a/libs/eigen/Eigen/src/Core/Redux.h 
b/libs/eigen/Eigen/src/Core/Redux.h index b6790d1..796e6c4 100644 --- a/libs/eigen/Eigen/src/Core/Redux.h +++ b/libs/eigen/Eigen/src/Core/Redux.h @@ -11,6 +11,8 @@ #ifndef EIGEN_REDUX_H #define EIGEN_REDUX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -198,8 +200,7 @@ struct redux_impl Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); - Scalar res; - res = eval.coeffByOuterInner(0, 0); + Scalar res = eval.coeffByOuterInner(0, 0); for(Index i = 1; i < xpr.innerSize(); ++i) res = func(res, eval.coeffByOuterInner(0, i)); for(Index i = 1; i < xpr.outerSize(); ++i) @@ -238,7 +239,7 @@ struct redux_impl const int packetAlignment = unpacket_traits::alignment; enum { alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) + alignment = plain_enum_max(alignment0, Evaluator::Alignment) }; const Index alignedStart = internal::first_default_aligned(xpr); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); @@ -353,12 +354,12 @@ struct redux_impl }; // evaluator adaptor -template -class redux_evaluator : public internal::evaluator<_XprType> +template +class redux_evaluator : public internal::evaluator { - typedef internal::evaluator<_XprType> Base; + typedef internal::evaluator Base; public: - typedef _XprType XprType; + typedef XprType_ XprType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} diff --git a/libs/eigen/Eigen/src/Core/Ref.h b/libs/eigen/Eigen/src/Core/Ref.h index c2a37ea..81de5f9 100644 --- a/libs/eigen/Eigen/src/Core/Ref.h +++ b/libs/eigen/Eigen/src/Core/Ref.h @@ -10,20 +10,22 @@ #ifndef EIGEN_REF_H #define EIGEN_REF_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -struct traits > - : public traits > +template +struct traits > + : public traits > { - typedef _PlainObjectType PlainObjectType; - typedef _StrideType StrideType; + typedef PlainObjectType_ PlainObjectType; + typedef StrideType_ StrideType; enum { - Options = _Options, - Flags = traits >::Flags | NestByRefBit, - Alignment = traits >::Alignment + Options = Options_, + Flags = traits >::Flags | NestByRefBit, + Alignment = traits >::Alignment }; template struct match { @@ -46,7 +48,7 @@ struct traits > ScalarTypeMatch = internal::is_same::value, MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch }; - typedef typename internal::conditional::type type; + typedef std::conditional_t type; }; }; @@ -197,8 +199,8 @@ protected: return false; } - ::new (static_cast(this)) Base(expr.data(), rows, cols); - ::new (&m_stride) StrideBase( + internal::construct_at(this, expr.data(), rows, cols); + internal::construct_at(&m_stride, (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride, (StrideType::InnerStrideAtCompileTime == 0) ? 
0 : inner_stride ); return true; @@ -285,7 +287,7 @@ template class Ref typedef internal::traits Traits; template EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr, - typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0); + std::enable_if_t::MatchAtCompileTime),Derived>* = 0); public: typedef RefBase Base; @@ -295,17 +297,17 @@ template class Ref #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr, - typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) + std::enable_if_t::MatchAtCompileTime),Derived>* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); - // Construction must pass since we will not create temprary storage in the non-const case. + // Construction must pass since we will not create temporary storage in the non-const case. const bool success = Base::construct(expr.derived()); EIGEN_UNUSED_VARIABLE(success) eigen_assert(success); } template EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, - typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) + std::enable_if_t::MatchAtCompileTime),Derived>* = 0) #else /** Implicit constructor from any dense expression */ template @@ -337,7 +339,7 @@ template class Ref< template EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, - typename internal::enable_if::ScalarTypeMatch),Derived>::type* = 0) + std::enable_if_t::ScalarTypeMatch),Derived>* = 0) { // std::cout << match_helper::HasDirectAccess << "," << match_helper::OuterStrideMatch << "," << match_helper::InnerStrideMatch << "\n"; // std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; diff --git a/libs/eigen/Eigen/src/Core/Replicate.h b/libs/eigen/Eigen/src/Core/Replicate.h index ab5be7e..4f91bbe 100644 --- a/libs/eigen/Eigen/src/Core/Replicate.h +++ b/libs/eigen/Eigen/src/Core/Replicate.h @@ -10,6 +10,8 @@ #ifndef EIGEN_REPLICATE_H #define EIGEN_REPLICATE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -21,7 +23,7 @@ struct traits > typedef typename traits::StorageKind StorageKind; typedef typename traits::XprKind XprKind; typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type _MatrixTypeNested; + typedef std::remove_reference_t MatrixTypeNested_; enum { RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic ? 
Dynamic @@ -62,19 +64,19 @@ template class Replicate : public internal::dense_xpr_base< Replicate >::type { typedef typename internal::traits::MatrixTypeNested MatrixTypeNested; - typedef typename internal::traits::_MatrixTypeNested _MatrixTypeNested; + typedef typename internal::traits::MatrixTypeNested_ MatrixTypeNested_; public: typedef typename internal::dense_xpr_base::type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Replicate) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; template EIGEN_DEVICE_FUNC inline explicit Replicate(const OriginalMatrixType& matrix) : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor) { - EIGEN_STATIC_ASSERT((internal::is_same::type,OriginalMatrixType>::value), + EIGEN_STATIC_ASSERT((internal::is_same,OriginalMatrixType>::value), THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) eigen_assert(RowFactor!=Dynamic && ColFactor!=Dynamic); } @@ -84,7 +86,7 @@ template class Replicate inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor) : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor) { - EIGEN_STATIC_ASSERT((internal::is_same::type,OriginalMatrixType>::value), + EIGEN_STATIC_ASSERT((internal::is_same,OriginalMatrixType>::value), THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) } @@ -94,7 +96,7 @@ template class Replicate inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } EIGEN_DEVICE_FUNC - const _MatrixTypeNested& nestedExpression() const + const MatrixTypeNested_& nestedExpression() const { return m_matrix; } diff --git a/libs/eigen/Eigen/src/Core/Reshaped.h b/libs/eigen/Eigen/src/Core/Reshaped.h index 52de73b..81355ac 100644 --- a/libs/eigen/Eigen/src/Core/Reshaped.h +++ b/libs/eigen/Eigen/src/Core/Reshaped.h @@ -11,6 +11,8 @@ #ifndef EIGEN_RESHAPED_H #define EIGEN_RESHAPED_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class Reshaped @@ -27,10 +29,9 @@ namespace Eigen { * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and * most of the time this is the only way it is used. * - * However, in C++98, if you want to directly maniputate reshaped expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. In C++11, it is advised to use the \em auto - * keyword for such use cases. + * If you want to directly manipulate reshaped expressions, + * for instance if you want to write a function returning such an expression, + * it is advised to use the \em auto keyword for such use cases. 
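A minimal sketch of the auto-based pattern recommended above (the sizes and values are illustrative, not taken from the Eigen docs):

```cpp
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd m(2, 6);
  m << 1, 2, 3,  4,  5,  6,
       7, 8, 9, 10, 11, 12;
  // reshaped() returns a Reshaped<...> expression viewing m's coefficients;
  // auto keeps that expression type without spelling it out and without copying.
  auto r = m.reshaped(3, 4);   // 3x4 view, column-major coefficient order
  double first = r(0, 0);      // same storage as m(0, 0)
  return first == m(0, 0) ? 0 : 1;
}
```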
* * Here is an example illustrating the dynamic case: * \include class_Reshaped.cpp @@ -156,7 +157,7 @@ class ReshapedImpl_dense EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; class InnerIterator; @@ -186,12 +187,12 @@ class ReshapedImpl_dense /** \returns the nested expression */ EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_reference::type& + std::remove_reference_t& nestedExpression() { return m_xpr; } protected: @@ -231,7 +232,7 @@ class ReshapedImpl_dense {} EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const + const internal::remove_all_t& nestedExpression() const { return m_xpr; } @@ -250,7 +251,7 @@ class ReshapedImpl_dense EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); + return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride(); } protected: @@ -324,7 +325,7 @@ struct reshaped_evaluator RowCol; - inline RowCol index_remap(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline RowCol index_remap(Index rowId, Index colId) const { if(Order==ColMajor) { @@ -443,7 +444,7 @@ struct reshaped_evaluator(xpr) { // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime - eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + eigen_assert(((internal::UIntPtr(xpr.data()) % plain_enum_max(1, evaluator::Alignment)) == 0) && "data is not aligned"); } }; diff --git a/libs/eigen/Eigen/src/Core/ReturnByValue.h b/libs/eigen/Eigen/src/Core/ReturnByValue.h index 4dad13e..9025282 100644 --- a/libs/eigen/Eigen/src/Core/ReturnByValue.h +++ b/libs/eigen/Eigen/src/Core/ReturnByValue.h @@ -11,6 +11,8 @@ #ifndef EIGEN_RETURNBYVALUE_H #define EIGEN_RETURNBYVALUE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -104,7 +106,7 @@ struct evaluator > EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); xpr.evalTo(m_result); } diff --git a/libs/eigen/Eigen/src/Core/Reverse.h b/libs/eigen/Eigen/src/Core/Reverse.h index 28cdd76..97e1d68 100644 --- a/libs/eigen/Eigen/src/Core/Reverse.h +++ b/libs/eigen/Eigen/src/Core/Reverse.h @@ -12,6 +12,8 @@ #ifndef EIGEN_REVERSE_H #define EIGEN_REVERSE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -24,13 +26,13 @@ struct traits > typedef typename traits::StorageKind StorageKind; typedef typename traits::XprKind XprKind; typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type _MatrixTypeNested; + typedef std::remove_reference_t MatrixTypeNested_; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit) + Flags = MatrixTypeNested_::Flags & (RowMajorBit | 
LvalueBit) }; }; @@ -67,7 +69,7 @@ template class Reverse typedef typename internal::dense_xpr_base::type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Reverse) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; using Base::IsRowMajor; protected: @@ -99,7 +101,7 @@ template class Reverse return -m_matrix.innerStride(); } - EIGEN_DEVICE_FUNC const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_matrix; @@ -173,10 +175,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { - const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; + constexpr Index HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(fix(half)) - .swap(xpr.bottomRows(fix(half)).colwise().reverse()); + xpr.template topRows(half) + .swap(xpr.template bottomRows(half).colwise().reverse()); } }; @@ -186,10 +188,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { - const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; + constexpr Index HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(fix(half)) - .swap(xpr.rightCols(fix(half)).rowwise().reverse()); + xpr.template leftCols(half) + .swap(xpr.template rightCols(half).rowwise().reverse()); } }; diff --git a/libs/eigen/Eigen/src/Core/Select.h b/libs/eigen/Eigen/src/Core/Select.h index 7c86bf8..d9ed2b2 100644 --- a/libs/eigen/Eigen/src/Core/Select.h +++ b/libs/eigen/Eigen/src/Core/Select.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELECT_H #define EIGEN_SELECT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class Select @@ -17,9 +19,9 @@ namespace Eigen { * * \brief Expression of a coefficient wise version of the C++ ternary operator ?: * - * \param ConditionMatrixType the type of the \em condition expression which must be a boolean matrix - * \param ThenMatrixType the type of the \em then expression - * \param ElseMatrixType the type of the \em else expression + * \tparam ConditionMatrixType the type of the \em condition expression which must be a boolean matrix + * \tparam ThenMatrixType the type of the \em then expression + * \tparam ElseMatrixType the type of the \em else expression * * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:. * It is the return type of DenseBase::select() and most of the time this is the only way it is used. 
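A concrete use of DenseBase::select(), mirroring the coefficient-wise ?: semantics just described (the values are illustrative):

```cpp
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXi m(3, 3);
  m << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;
  // Coefficient-wise ternary: where the condition holds, take -m's entry, else keep m's.
  m = (m.array() >= 5).select(-m, m);  // negates every coefficient >= 5
}
```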
diff --git a/libs/eigen/Eigen/src/Core/SelfAdjointView.h b/libs/eigen/Eigen/src/Core/SelfAdjointView.h index 8ce3b37..7a930db 100644 --- a/libs/eigen/Eigen/src/Core/SelfAdjointView.h +++ b/libs/eigen/Eigen/src/Core/SelfAdjointView.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELFADJOINTMATRIX_H #define EIGEN_SELFADJOINTMATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class SelfAdjointView @@ -18,8 +20,8 @@ namespace Eigen { * * \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix * - * \param MatrixType the type of the dense matrix storing the coefficients - * \param TriangularPart can be either \c #Lower or \c #Upper + * \tparam MatrixType the type of the dense matrix storing the coefficients + * \tparam TriangularPart can be either \c #Lower or \c #Upper * * This class is an expression of a sefladjoint matrix from a triangular part of a matrix * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView() @@ -33,7 +35,7 @@ template struct traits > : traits { typedef typename ref_selector::non_const_type MatrixTypeNested; - typedef typename remove_all::type MatrixTypeNestedCleaned; + typedef remove_all_t MatrixTypeNestedCleaned; typedef MatrixType ExpressionType; typedef typename MatrixType::PlainObject FullMatrixType; enum { @@ -46,12 +48,13 @@ struct traits > : traits } -template class SelfAdjointView - : public TriangularBase > +template class SelfAdjointView + : public TriangularBase > { public: + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY) - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef TriangularBase Base; typedef typename internal::traits::MatrixTypeNested MatrixTypeNested; typedef typename internal::traits::MatrixTypeNestedCleaned MatrixTypeNestedCleaned; @@ -60,8 +63,8 @@ template class SelfAdjointView /** \brief The type of coefficients in this matrix */ typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; - typedef typename internal::remove_all::type MatrixConjugateReturnType; - typedef SelfAdjointView::type, UpLo> ConstSelfAdjointView; + typedef internal::remove_all_t MatrixConjugateReturnType; + typedef SelfAdjointView, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, @@ -71,10 +74,7 @@ template class SelfAdjointView typedef typename MatrixType::PlainObject PlainObject; EIGEN_DEVICE_FUNC - explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - { - EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); - } + explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) { } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } @@ -180,16 +180,16 @@ template class SelfAdjointView */ template EIGEN_DEVICE_FUNC - typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), - TriangularView, - TriangularView >::type + std::conditional_t<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), + TriangularView, + TriangularView > triangularView() const { - typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix); - typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1); - return typename 
internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), - TriangularView, - TriangularView >::type(tmp2); + std::conditional_t<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType> tmp1(m_matrix); + std::conditional_t<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType> tmp2(tmp1); + return std::conditional_t<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), + TriangularView, + TriangularView >(tmp2); } typedef SelfAdjointView ConjugateReturnType; @@ -203,10 +203,10 @@ template class SelfAdjointView */ template EIGEN_DEVICE_FUNC - inline typename internal::conditional::type + inline std::conditional_t conjugateIf() const { - typedef typename internal::conditional::type ReturnType; + typedef std::conditional_t ReturnType; return ReturnType(m_matrix.template conjugateIf()); } @@ -218,10 +218,10 @@ template class SelfAdjointView typedef SelfAdjointView TransposeReturnType; /** \sa MatrixBase::transpose() */ + template EIGEN_DEVICE_FUNC - inline TransposeReturnType transpose() + inline TransposeReturnType transpose(std::enable_if_t::value, Dummy*> = nullptr) { - EIGEN_STATIC_ASSERT_LVALUE(MatrixType) typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } diff --git a/libs/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h b/libs/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h index 7c89c2e..14dbec0 100644 --- a/libs/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/libs/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELFCWISEBINARYOP_H #define EIGEN_SELFCWISEBINARYOP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // TODO generalize the scalar type of 'other' diff --git a/libs/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h b/libs/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h new file mode 100644 index 0000000..7f6b5fd --- /dev/null +++ b/libs/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h @@ -0,0 +1,412 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// Copyright (C) 2007-2009 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKEWSYMMETRICMATRIX3_H +#define EIGEN_SKEWSYMMETRICMATRIX3_H + +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class SkewSymmetricBase + * \ingroup Core_Module + * + * \brief Base class for skew symmetric matrices and expressions + * + * This is the base class that is inherited by SkewSymmetricMatrix3 and related expression + * types, which internally use a three-vector for storing the entries. SkewSymmetric + * types always represent square three-by-three matrices. + * + * This implementation follows class DiagonalMatrix. + * + * \tparam Derived is the derived type, a SkewSymmetricMatrix3 or SkewSymmetricWrapper.
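The defining property these new types encode is the cross-product identity [v]x u = v x u. A quick sketch against the API added by this patch (it only compiles once this new header is part of the build):

```cpp
#include <Eigen/Dense>

int main() {
  Eigen::Vector3d v(1.0, 2.0, 3.0);
  Eigen::Vector3d u(0.5, -1.0, 2.0);
  // asSkewSymmetric() (declared later in this file) views v as the matrix [v]x;
  // multiplying by [v]x performs the same linear map as crossing with v.
  Eigen::Matrix3d vx = v.asSkewSymmetric().toDenseMatrix();
  Eigen::Vector3d w1 = vx * u;
  Eigen::Vector3d w2 = v.cross(u);  // w1 and w2 agree up to rounding
  return w1.isApprox(w2) ? 0 : 1;
}
```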
+ * + * \sa class SkewSymmetricMatrix3, class SkewSymmetricWrapper + */ +template +class SkewSymmetricBase : public EigenBase +{ + public: + typedef typename internal::traits::SkewSymmetricVectorType SkewSymmetricVectorType; + typedef typename SkewSymmetricVectorType::Scalar Scalar; + typedef typename SkewSymmetricVectorType::RealScalar RealScalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::StorageIndex StorageIndex; + + enum { + RowsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime, + ColsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime, + MaxRowsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime, + MaxColsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime, + IsVectorAtCompileTime = 0, + Flags = NoPreferredStorageOrderBit + }; + + typedef Matrix DenseMatrixType; + typedef DenseMatrixType DenseType; + typedef SkewSymmetricMatrix3 PlainObject; + + /** \returns a const reference to the derived object. */ + EIGEN_DEVICE_FUNC + inline const Derived& derived() const { return *static_cast(this); } + /** \returns a reference to the derived object. */ + EIGEN_DEVICE_FUNC + inline Derived& derived() { return *static_cast(this); } + + /** + * Constructs a dense matrix from \c *this. Note that this directly returns a dense matrix type, + * not an expression. + * \returns A dense matrix, with its entries set from the derived object. */ + EIGEN_DEVICE_FUNC + DenseMatrixType toDenseMatrix() const { return derived(); } + + /** The determinant vanishes. */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Scalar determinant() const { return 0; } + + /** A.transpose() = -A */ + EIGEN_DEVICE_FUNC + PlainObject transpose() const { return (-vector()).asSkewSymmetric(); } + + /** \returns the exponential of this matrix using Rodrigues’ formula */ + EIGEN_DEVICE_FUNC + DenseMatrixType exponential() const { + DenseMatrixType retVal = DenseMatrixType::Identity(); + const SkewSymmetricVectorType& v = vector(); + if (v.isZero()) { + return retVal; + } + const Scalar norm2 = v.squaredNorm(); + const Scalar norm = numext::sqrt(norm2); + retVal += ((((1 - numext::cos(norm))/norm2)*derived())*derived()) + (numext::sin(norm)/norm)*derived().toDenseMatrix(); + return retVal; + } + + /** \returns a const reference to the derived object's vector of coefficients. */ + EIGEN_DEVICE_FUNC + inline const SkewSymmetricVectorType& vector() const { return derived().vector(); } + /** \returns a reference to the derived object's vector of coefficients. */ + EIGEN_DEVICE_FUNC + inline SkewSymmetricVectorType& vector() { return derived().vector(); } + + /** \returns the number of rows. */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const { return 3; } + /** \returns the number of columns.
*/ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const { return 3; } + + /** \returns the matrix product of \c *this by the dense matrix, \a matrix */ + template + EIGEN_DEVICE_FUNC + Product + operator*(const MatrixBase &matrix) const + { + return Product(derived(), matrix.derived()); + } + + /** \returns the matrix product of \c *this by the skew symmetric matrix, \a matrix */ + template + EIGEN_DEVICE_FUNC + Product + operator*(const SkewSymmetricBase &matrix) const + { + return Product(derived(), matrix.derived()); + } + + template + using SkewSymmetricProductReturnType = SkewSymmetricWrapper; + + /** \returns the wedge product of \c *this by the skew symmetric matrix \a other + * A wedge B = AB - BA */ + template + EIGEN_DEVICE_FUNC SkewSymmetricProductReturnType wedge( + const SkewSymmetricBase& other) const { + return vector().cross(other.vector()).asSkewSymmetric(); + } + + using SkewSymmetricScaleReturnType = + SkewSymmetricWrapper; + + /** \returns the product of \c *this by the scalar \a scalar */ + EIGEN_DEVICE_FUNC + inline SkewSymmetricScaleReturnType operator*(const Scalar& scalar) const { + return (vector() * scalar).asSkewSymmetric(); + } + + using ScaleSkewSymmetricReturnType = + SkewSymmetricWrapper; + + /** \returns the product of a scalar and the skew symmetric matrix \a other */ + EIGEN_DEVICE_FUNC + friend inline ScaleSkewSymmetricReturnType operator*(const Scalar& scalar, const SkewSymmetricBase& other) { + return (scalar * other.vector()).asSkewSymmetric(); + } + + template + using SkewSymmetricSumReturnType = SkewSymmetricWrapper; + + /** \returns the sum of \c *this and the skew symmetric matrix \a other */ + template + EIGEN_DEVICE_FUNC inline SkewSymmetricSumReturnType operator+( + const SkewSymmetricBase& other) const { + return (vector() + other.vector()).asSkewSymmetric(); + } + + template + using SkewSymmetricDifferenceReturnType = SkewSymmetricWrapper; + + /** \returns the difference of \c *this and the skew symmetric matrix \a other */ + template + EIGEN_DEVICE_FUNC inline SkewSymmetricDifferenceReturnType operator-( + const SkewSymmetricBase& other) const { + return (vector() - other.vector()).asSkewSymmetric(); + } +}; + +/** \class SkewSymmetricMatrix3 + * \ingroup Core_Module + * + * \brief Represents a 3x3 skew symmetric matrix with its storage + * + * \tparam Scalar_ the type of coefficients + * + * \sa class SkewSymmetricBase, class SkewSymmetricWrapper + */ + +namespace internal { +template +struct traits > + : traits > +{ + typedef Matrix SkewSymmetricVectorType; + typedef SkewSymmetricShape StorageKind; + enum { + Flags = LvalueBit | NoPreferredStorageOrderBit | NestByRefBit + }; +}; +} +template +class SkewSymmetricMatrix3 + : public SkewSymmetricBase > +{ + public: + #ifndef EIGEN_PARSED_BY_DOXYGEN + typedef typename internal::traits::SkewSymmetricVectorType SkewSymmetricVectorType; + typedef const SkewSymmetricMatrix3& Nested; + typedef Scalar_ Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::StorageIndex StorageIndex; + #endif + + protected: + + SkewSymmetricVectorType m_vector; + + public: + + /** const version of vector(). */ + EIGEN_DEVICE_FUNC + inline const SkewSymmetricVectorType& vector() const { return m_vector; } + /** \returns a reference to the stored vector of coefficients. 
*/ + EIGEN_DEVICE_FUNC + inline SkewSymmetricVectorType& vector() { return m_vector; } + + /** Default constructor without initialization */ + EIGEN_DEVICE_FUNC + inline SkewSymmetricMatrix3() {} + + /** Constructor from three scalars */ + EIGEN_DEVICE_FUNC + inline SkewSymmetricMatrix3(const Scalar& x, const Scalar& y, const Scalar& z) : m_vector(x,y,z) {} + + /** \brief Constructs a SkewSymmetricMatrix3 from an r-value vector type */ + EIGEN_DEVICE_FUNC + explicit inline SkewSymmetricMatrix3(SkewSymmetricVectorType&& vec) : m_vector(std::move(vec)) {} + + /** generic constructor from expression of the coefficients */ + template + EIGEN_DEVICE_FUNC + explicit inline SkewSymmetricMatrix3(const MatrixBase& other) : m_vector(other) + {} + + /** Copy constructor. */ + template + EIGEN_DEVICE_FUNC + inline SkewSymmetricMatrix3(const SkewSymmetricBase& other) : m_vector(other.vector()) {} + + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */ + inline SkewSymmetricMatrix3(const SkewSymmetricMatrix3& other) : m_vector(other.vector()) {} + #endif + + /** Copy operator. */ + template + EIGEN_DEVICE_FUNC + SkewSymmetricMatrix3& operator=(const SkewSymmetricBase& other) + { + m_vector = other.vector(); + return *this; + } + + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** This is a special case of the templated operator=. Its purpose is to + * prevent a default operator= from hiding the templated operator=. + */ + EIGEN_DEVICE_FUNC + SkewSymmetricMatrix3& operator=(const SkewSymmetricMatrix3& other) + { + m_vector = other.vector(); + return *this; + } + #endif + + typedef SkewSymmetricWrapper, SkewSymmetricVectorType>> + InitializeReturnType; + + /** Initializes a skew symmetric matrix with coefficients set to zero */ + EIGEN_DEVICE_FUNC + static InitializeReturnType Zero() { return SkewSymmetricVectorType::Zero().asSkewSymmetric(); } + + /** Sets all coefficients to zero. */ + EIGEN_DEVICE_FUNC + inline void setZero() { m_vector.setZero(); } +}; + +/** \class SkewSymmetricWrapper + * \ingroup Core_Module + * + * \brief Expression of a skew symmetric matrix + * + * \tparam SkewSymmetricVectorType_ the type of the vector of coefficients + * + * This class is an expression of a skew symmetric matrix, but not storing its own vector of coefficients, + * instead wrapping an existing vector expression. It is the return type of MatrixBase::asSkewSymmetric() + * and most of the time this is the only way that it is used. 
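Combined with the Rodrigues-formula exponential() defined above, the wrapper gives a compact axis-angle-to-rotation conversion. A sketch chaining the new API (the AngleAxis comparison is illustrative and assumes the Geometry module):

```cpp
#include <Eigen/Dense>

int main() {
  const double angle = 1.0471975511965976;  // pi / 3
  const Eigen::Vector3d axis = Eigen::Vector3d(1.0, 1.0, 0.0).normalized();
  // exp(angle * [axis]x) is the rotation by 'angle' about 'axis' (Rodrigues' formula).
  Eigen::Matrix3d R = (angle * axis).asSkewSymmetric().exponential();
  Eigen::Matrix3d Rref = Eigen::AngleAxisd(angle, axis).toRotationMatrix();
  return R.isApprox(Rref) ? 0 : 1;
}
```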
+ * + * \sa class SkewSymmetricMatrix3, class SkewSymmetricBase, MatrixBase::asSkewSymmetric() + */ + +namespace internal { +template +struct traits > +{ + typedef SkewSymmetricVectorType_ SkewSymmetricVectorType; + typedef typename SkewSymmetricVectorType::Scalar Scalar; + typedef typename SkewSymmetricVectorType::StorageIndex StorageIndex; + typedef SkewSymmetricShape StorageKind; + typedef typename traits::XprKind XprKind; + enum { + RowsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime, + ColsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime, + MaxRowsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime, + MaxColsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime, + Flags = (traits::Flags & LvalueBit) | NoPreferredStorageOrderBit + }; +}; +} + +template +class SkewSymmetricWrapper + : public SkewSymmetricBase >, internal::no_assignment_operator +{ + public: + #ifndef EIGEN_PARSED_BY_DOXYGEN + typedef SkewSymmetricVectorType_ SkewSymmetricVectorType; + typedef SkewSymmetricWrapper Nested; + #endif + + /** Constructor from expression of coefficients to wrap. */ + EIGEN_DEVICE_FUNC + explicit inline SkewSymmetricWrapper(SkewSymmetricVectorType& a_vector) : m_vector(a_vector) {} + + /** \returns a const reference to the wrapped expression of coefficients. */ + EIGEN_DEVICE_FUNC + const SkewSymmetricVectorType& vector() const { return m_vector; } + + protected: + typename SkewSymmetricVectorType::Nested m_vector; +}; + +/** \returns a pseudo-expression of a skew symmetric matrix with *this as vector of coefficients + * + * \only_for_vectors + * + * \sa class SkewSymmetricWrapper, class SkewSymmetricMatrix3, vector(), isSkewSymmetric() + **/ +template +EIGEN_DEVICE_FUNC inline const SkewSymmetricWrapper +MatrixBase::asSkewSymmetric() const +{ + return SkewSymmetricWrapper(derived()); +} + +/** \returns true if *this is approximately equal to a skew symmetric matrix, + * within the precision given by \a prec. + */ +template +bool MatrixBase::isSkewSymmetric(const RealScalar& prec) const +{ + if(cols() != rows()) return false; + return (this->transpose() + *this).isZero(prec); +} + +/** \returns the matrix product of \c *this by the skew symmetric matrix \a skew.
+ */ +template +template +EIGEN_DEVICE_FUNC inline const Product +MatrixBase::operator*(const SkewSymmetricBase &skew) const +{ + return Product(derived(), skew.derived()); +} + +namespace internal { + +template<> struct storage_kind_to_shape { typedef SkewSymmetricShape Shape; }; + +struct SkewSymmetric2Dense {}; + +template<> struct AssignmentKind { typedef SkewSymmetric2Dense Kind; }; + +// SkewSymmetric matrix to Dense assignment +template< typename DstXprType, typename SrcXprType, typename Functor> +struct Assignment +{ + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + if((dst.rows()!=3) || (dst.cols()!=3)) { + dst.resize(3, 3); + } + dst.diagonal().setZero(); + const typename SrcXprType::SkewSymmetricVectorType v = src.vector(); + dst(0, 1) = -v(2); + dst(1, 0) = v(2); + dst(0, 2) = v(1); + dst(2, 0) = -v(1); + dst(1, 2) = -v(0); + dst(2, 1) = v(0); + } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.vector() += src.vector(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.vector() -= src.vector(); } +}; + +} // namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SKEWSYMMETRICMATRIX3_H diff --git a/libs/eigen/Eigen/src/Core/Solve.h b/libs/eigen/Eigen/src/Core/Solve.h index 23d5cb7..f77eac9 100644 --- a/libs/eigen/Eigen/src/Core/Solve.h +++ b/libs/eigen/Eigen/src/Core/Solve.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SOLVE_H #define EIGEN_SOLVE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class SolveImpl; @@ -77,7 +79,7 @@ public: protected: const Decomposition &m_dec; - const RhsType &m_rhs; + const typename internal::ref_selector::type m_rhs; }; @@ -123,7 +125,7 @@ struct evaluator > EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); solve.dec()._solve_impl(solve.rhs(), m_result); } diff --git a/libs/eigen/Eigen/src/Core/SolveTriangular.h b/libs/eigen/Eigen/src/Core/SolveTriangular.h index dfbf995..71d6f85 100644 --- a/libs/eigen/Eigen/src/Core/SolveTriangular.h +++ b/libs/eigen/Eigen/src/Core/SolveTriangular.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SOLVETRIANGULAR_H #define EIGEN_SOLVETRIANGULAR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -87,7 +89,7 @@ struct triangular_solver_selector static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { - typename internal::add_const_on_value_type::type actualLhs = LhsProductTraits::extract(lhs); + add_const_on_value_type_t actualLhs = LhsProductTraits::extract(lhs); const Index size = lhs.rows(); const Index othersize = Side==OnTheLeft? 
rhs.cols() : rhs.rows(); @@ -174,11 +176,11 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(c return; enum { copy = (internal::traits::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1}; - typedef typename internal::conditional::type, OtherDerived&>::type OtherCopy; + typedef std::conditional_t::type, OtherDerived&> OtherCopy; OtherCopy otherCopy(other); - internal::triangular_solver_selector::type, + internal::triangular_solver_selector, Side, Mode>::run(derived().nestedExpression(), otherCopy); if (copy) @@ -206,7 +208,7 @@ struct traits > template struct triangular_solve_retval : public ReturnByValue > { - typedef typename remove_all::type RhsNestedCleaned; + typedef remove_all_t RhsNestedCleaned; typedef ReturnByValue Base; triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) diff --git a/libs/eigen/Eigen/src/Core/SolverBase.h b/libs/eigen/Eigen/src/Core/SolverBase.h index 5014610..7396e04 100644 --- a/libs/eigen/Eigen/src/Core/SolverBase.h +++ b/libs/eigen/Eigen/src/Core/SolverBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SOLVERBASE_H #define EIGEN_SOLVERBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -28,7 +30,7 @@ struct solve_assertion > template static void run(const type& transpose, const Rhs& b) { - internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); + internal::solve_assertion>::template run(transpose.nestedExpression(), b); } }; @@ -40,7 +42,7 @@ struct solve_assertion template static void run(const type& adjoint, const Rhs& b) { - internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); + internal::solve_assertion >>::template run(adjoint.nestedExpression(), b); } }; } // end namespace internal @@ -79,12 +81,11 @@ class SolverBase : public EigenBase enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, - SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, - internal::traits::ColsAtCompileTime>::ret), + SizeAtCompileTime = (internal::size_of_xpr_at_compile_time::ret), MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, - MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, - internal::traits::MaxColsAtCompileTime>::ret), + MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 || internal::traits::MaxColsAtCompileTime == 1, NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2 @@ -105,12 +106,12 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - internal::solve_assertion::type>::template run(derived(), b); + internal::solve_assertion>::template run(derived(), b); return Solve(derived(), b.derived()); } /** \internal the return type of transpose() */ - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; /** \returns an expression of the transposed of the factored matrix. 
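The canonical snippet for this (assuming some SolverBase-derived decomposition; PartialPivLU is used here just for concreteness):

```cpp
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::VectorXd b = Eigen::VectorXd::Random(4);
  Eigen::PartialPivLU<Eigen::MatrixXd> dec(A);
  // Solves A^T x = b by reusing the existing factorization of A;
  // no second decomposition is computed.
  Eigen::VectorXd x = dec.transpose().solve(b);
  return (A.transpose() * x).isApprox(b) ? 0 : 1;
}
```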
* * A typical usage is to solve for the transposed problem A^T x = b: @@ -118,16 +119,16 @@ class SolverBase : public EigenBase * * \sa adjoint(), solve() */ - inline ConstTransposeReturnType transpose() const + inline const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } /** \internal the return type of adjoint() */ - typedef typename internal::conditional::IsComplex, - CwiseUnaryOp, ConstTransposeReturnType>, - ConstTransposeReturnType - >::type AdjointReturnType; + typedef std::conditional_t::IsComplex, + CwiseUnaryOp, const ConstTransposeReturnType>, + const ConstTransposeReturnType + > AdjointReturnType; /** \returns an expression of the adjoint of the factored matrix * * A typical usage is to solve for the adjoint problem A' x = b: @@ -137,7 +138,7 @@ class SolverBase : public EigenBase * * \sa transpose(), solve() */ - inline AdjointReturnType adjoint() const + inline const AdjointReturnType adjoint() const { return AdjointReturnType(derived().transpose()); } diff --git a/libs/eigen/Eigen/src/Core/StableNorm.h b/libs/eigen/Eigen/src/Core/StableNorm.h index 4a3f0cc..a3bc918 100644 --- a/libs/eigen/Eigen/src/Core/StableNorm.h +++ b/libs/eigen/Eigen/src/Core/StableNorm.h @@ -10,6 +10,8 @@ #ifndef EIGEN_STABLENORM_H #define EIGEN_STABLENORM_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -57,7 +59,7 @@ void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealSca const Index blockSize = 4096; typedef typename internal::nested_eval::type VectorTypeCopy; - typedef typename internal::remove_all::type VectorTypeCopyClean; + typedef internal::remove_all_t VectorTypeCopyClean; const VectorTypeCopy copy(vec); enum { @@ -66,8 +68,8 @@ void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealSca ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization }; - typedef typename internal::conditional, internal::evaluator::Alignment>, - typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + typedef std::conditional_t, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType> SegmentWrapper; Index n = vec.size(); Index bi = internal::first_default_aligned(copy); @@ -79,7 +81,7 @@ void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealSca template typename VectorType::RealScalar -stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +stable_norm_impl(const VectorType &vec, std::enable_if_t* = 0 ) { using std::sqrt; using std::abs; @@ -101,7 +103,7 @@ stable_norm_impl(const VectorType &vec, typename enable_if typename MatrixType::RealScalar -stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +stable_norm_impl(const MatrixType &mat, std::enable_if_t* = 0 ) { using std::sqrt; diff --git a/libs/eigen/Eigen/src/Core/StlIterators.h b/libs/eigen/Eigen/src/Core/StlIterators.h index 09041db..d5d3971 100644 --- a/libs/eigen/Eigen/src/Core/StlIterators.h +++ b/libs/eigen/Eigen/src/Core/StlIterators.h @@ -10,6 +10,8 @@ #ifndef EIGEN_STLITERATORS_H #define EIGEN_STLITERATORS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -25,7 +27,7 @@ protected: typedef typename traits::XprType XprType; typedef indexed_based_stl_iterator_base non_const_iterator; typedef indexed_based_stl_iterator_base const_iterator; - typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type 
other_iterator; + typedef std::conditional_t::value,non_const_iterator,const_iterator> other_iterator; // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: friend class indexed_based_stl_iterator_base; friend class indexed_based_stl_iterator_base; @@ -104,7 +106,7 @@ protected: typedef typename traits::XprType XprType; typedef indexed_based_stl_reverse_iterator_base non_const_iterator; typedef indexed_based_stl_reverse_iterator_base const_iterator; - typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + typedef std::conditional_t::value,non_const_iterator,const_iterator> other_iterator; // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: friend class indexed_based_stl_reverse_iterator_base; friend class indexed_based_stl_reverse_iterator_base; @@ -179,18 +181,18 @@ template class pointer_based_stl_iterator { enum { is_lvalue = internal::is_lvalue::value }; - typedef pointer_based_stl_iterator::type> non_const_iterator; - typedef pointer_based_stl_iterator::type> const_iterator; - typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + typedef pointer_based_stl_iterator> non_const_iterator; + typedef pointer_based_stl_iterator> const_iterator; + typedef std::conditional_t::value,non_const_iterator,const_iterator> other_iterator; // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: - friend class pointer_based_stl_iterator::type>; - friend class pointer_based_stl_iterator::type>; + friend class pointer_based_stl_iterator>; + friend class pointer_based_stl_iterator>; public: typedef Index difference_type; typedef typename XprType::Scalar value_type; typedef std::random_access_iterator_tag iterator_category; - typedef typename internal::conditional::type pointer; - typedef typename internal::conditional::type reference; + typedef std::conditional_t pointer; + typedef std::conditional_t reference; pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} @@ -256,12 +258,12 @@ protected: internal::variable_if_dynamic m_incr; }; -template -struct indexed_based_stl_iterator_traits > +template +struct indexed_based_stl_iterator_traits > { - typedef _XprType XprType; - typedef generic_randaccess_stl_iterator::type> non_const_iterator; - typedef generic_randaccess_stl_iterator::type> const_iterator; + typedef XprType_ XprType; + typedef generic_randaccess_stl_iterator> non_const_iterator; + typedef generic_randaccess_stl_iterator> const_iterator; }; template @@ -283,13 +285,13 @@ protected: // TODO currently const Transpose/Reshape expressions never returns const references, // so lets return by value too. 
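In user code these iterator adaptors surface through begin()/end() and through iteration over rowwise()/colwise(); a small sketch of what they enable (Eigen 3.4-style API, values illustrative):

```cpp
#include <algorithm>
#include <Eigen/Dense>

int main() {
  Eigen::VectorXd v(5);
  v << 3, 1, 4, 1, 5;
  // pointer_based_stl_iterator backs begin()/end() on direct-access vectors,
  // so STL algorithms apply directly.
  std::sort(v.begin(), v.end());
  // subvector_stl_iterator backs iteration over colwise()/rowwise():
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 4);
  for (auto col : m.colwise()) col.normalize();  // each col references m's data
}
```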
- //typedef typename internal::conditional::type read_only_ref_t; + //typedef std::conditional_t read_only_ref_t; typedef const value_type read_only_ref_t; public: - typedef typename internal::conditional::type pointer; - typedef typename internal::conditional::type reference; + typedef std::conditional_t pointer; + typedef std::conditional_t reference; generic_randaccess_stl_iterator() : Base() {} generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} @@ -301,12 +303,12 @@ public: pointer operator->() const { return &((*mp_xpr)(m_index)); } }; -template -struct indexed_based_stl_iterator_traits > +template +struct indexed_based_stl_iterator_traits > { - typedef _XprType XprType; - typedef subvector_stl_iterator::type, Direction> non_const_iterator; - typedef subvector_stl_iterator::type, Direction> const_iterator; + typedef XprType_ XprType; + typedef subvector_stl_iterator, Direction> non_const_iterator; + typedef subvector_stl_iterator, Direction> const_iterator; }; template @@ -320,12 +322,12 @@ protected: using Base::m_index; using Base::mp_xpr; - typedef typename internal::conditional::type SubVectorType; - typedef typename internal::conditional::type ConstSubVectorType; + typedef std::conditional_t SubVectorType; + typedef std::conditional_t ConstSubVectorType; public: - typedef typename internal::conditional::type reference; + typedef std::conditional_t reference; typedef typename reference::PlainObject value_type; private: @@ -349,12 +351,12 @@ public: pointer operator->() const { return (*mp_xpr).template subVector(m_index); } }; -template -struct indexed_based_stl_iterator_traits > +template +struct indexed_based_stl_iterator_traits > { - typedef _XprType XprType; - typedef subvector_stl_reverse_iterator::type, Direction> non_const_iterator; - typedef subvector_stl_reverse_iterator::type, Direction> const_iterator; + typedef XprType_ XprType; + typedef subvector_stl_reverse_iterator, Direction> non_const_iterator; + typedef subvector_stl_reverse_iterator, Direction> const_iterator; }; template @@ -368,12 +370,12 @@ protected: using Base::m_index; using Base::mp_xpr; - typedef typename internal::conditional::type SubVectorType; - typedef typename internal::conditional::type ConstSubVectorType; + typedef std::conditional_t SubVectorType; + typedef std::conditional_t ConstSubVectorType; public: - typedef typename internal::conditional::type reference; + typedef std::conditional_t reference; typedef typename reference::PlainObject value_type; private: diff --git a/libs/eigen/Eigen/src/Core/Stride.h b/libs/eigen/Eigen/src/Core/Stride.h index 6494d51..2832e80 100644 --- a/libs/eigen/Eigen/src/Core/Stride.h +++ b/libs/eigen/Eigen/src/Core/Stride.h @@ -10,6 +10,8 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class Stride @@ -31,27 +33,31 @@ namespace Eigen { * arguments to the constructor. * * Indeed, this class takes two template parameters: - * \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. - * \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. + * \tparam OuterStrideAtCompileTime_ the outer stride, or Dynamic if you want to specify it at runtime. + * \tparam InnerStrideAtCompileTime_ the inner stride, or Dynamic if you want to specify it at runtime. 
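An inline sketch of a strided Map before the shipped example below (array contents are illustrative):

```cpp
#include <Eigen/Dense>

int main() {
  double data[12];
  for (int i = 0; i < 12; ++i) data[i] = i;
  // View every other entry of 'data' as a 2x3 column-major matrix:
  // outer stride 4 (pointer increment between columns),
  // inner stride 2 (pointer increment between rows within a column).
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::Stride<4, 2>> m(data, 2, 3);
  return m(1, 2) == data[10] ? 0 : 1;  // element (1,2) sits at 2*4 + 1*2 == 10
}
```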
* * Here is an example: * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * - * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were * not allowed). * + * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompileTime==1), + * the inner stride is the pointer increment between two consecutive elements, + * regardless of storage layout. + * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ -template +template class Stride { public: typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { - InnerStrideAtCompileTime = _InnerStrideAtCompileTime, - OuterStrideAtCompileTime = _OuterStrideAtCompileTime + InnerStrideAtCompileTime = InnerStrideAtCompileTime_, + OuterStrideAtCompileTime = OuterStrideAtCompileTime_ }; /** Default constructor, for use when strides are fixed at compile time */ diff --git a/libs/eigen/Eigen/src/Core/Swap.h b/libs/eigen/Eigen/src/Core/Swap.h index 180a4e5..b2e7511 100644 --- a/libs/eigen/Eigen/src/Core/Swap.h +++ b/libs/eigen/Eigen/src/Core/Swap.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SWAP_H #define EIGEN_SWAP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/Transpose.h b/libs/eigen/Eigen/src/Core/Transpose.h index 2bc658f..74650ef 100644 --- a/libs/eigen/Eigen/src/Core/Transpose.h +++ b/libs/eigen/Eigen/src/Core/Transpose.h @@ -11,6 +11,8 @@ #ifndef EIGEN_TRANSPOSE_H #define EIGEN_TRANSPOSE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -18,7 +20,7 @@ template struct traits > : public traits { typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type MatrixTypeNestedPlain; + typedef std::remove_reference_t MatrixTypeNestedPlain; enum { RowsAtCompileTime = MatrixType::ColsAtCompileTime, ColsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -58,7 +60,7 @@ template class Transpose typedef typename TransposeImpl::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} @@ -72,12 +74,12 @@ template class Transpose /** \returns the nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::remove_reference::type& + std::remove_reference_t& nestedExpression() { return m_matrix; } /** \internal */ @@ -130,11 +132,11 @@ template class TransposeImpl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return derived().nestedExpression().outerStride(); } - typedef typename internal::conditional< - internal::is_lvalue::value, - Scalar, - const Scalar - >::type ScalarWithConstIfNotLvalue; + typedef std::conditional_t< + internal::is_lvalue::value, + Scalar, + const Scalar + > ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } @@ -178,7 +180,7 @@ template class TransposeImpl * \sa
transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +typename DenseBase::TransposeReturnType DenseBase::transpose() { return TransposeReturnType(derived()); @@ -191,7 +193,7 @@ DenseBase::transpose() * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +const typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); diff --git a/libs/eigen/Eigen/src/Core/Transpositions.h b/libs/eigen/Eigen/src/Core/Transpositions.h index 38a7b01..84a9773 100644 --- a/libs/eigen/Eigen/src/Core/Transpositions.h +++ b/libs/eigen/Eigen/src/Core/Transpositions.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TRANSPOSITIONS_H #define EIGEN_TRANSPOSITIONS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template @@ -113,11 +115,11 @@ class TranspositionsBase }; namespace internal { -template -struct traits > - : traits > +template +struct traits > + : traits > { - typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType; + typedef Matrix IndicesType; typedef TranspositionsStorage StorageKind; }; } @@ -151,8 +153,8 @@ struct traits -class Transpositions : public TranspositionsBase > +template +class Transpositions : public TranspositionsBase > { typedef internal::traits Traits; public: @@ -199,19 +201,19 @@ class Transpositions : public TranspositionsBase -struct traits,_PacketAccess> > - : traits > +template +struct traits,PacketAccess_> > + : traits > { - typedef Map, _PacketAccess> IndicesType; - typedef _StorageIndex StorageIndex; + typedef Map, PacketAccess_> IndicesType; + typedef StorageIndex_ StorageIndex; typedef TranspositionsStorage StorageKind; }; } -template -class Map,PacketAccess> - : public TranspositionsBase,PacketAccess> > +template +class Map,PacketAccess> + : public TranspositionsBase,PacketAccess> > { typedef internal::traits Traits; public: @@ -260,17 +262,17 @@ class Map,P }; namespace internal { -template -struct traits > - : traits > +template +struct traits > + : traits > { typedef TranspositionsStorage StorageKind; }; } -template +template class TranspositionsWrapper - : public TranspositionsBase > + : public TranspositionsBase > { typedef internal::traits Traits; public: diff --git a/libs/eigen/Eigen/src/Core/TriangularMatrix.h b/libs/eigen/Eigen/src/Core/TriangularMatrix.h index fdb8bc1..c1bd13a 100644 --- a/libs/eigen/Eigen/src/Core/TriangularMatrix.h +++ b/libs/eigen/Eigen/src/Core/TriangularMatrix.h @@ -11,6 +11,8 @@ #ifndef EIGEN_TRIANGULARMATRIX_H #define EIGEN_TRIANGULARMATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -35,14 +37,13 @@ template class TriangularBase : public EigenBase MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, - SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, - internal::traits::ColsAtCompileTime>::ret), + SizeAtCompileTime = (internal::size_of_xpr_at_compile_time::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. 
\sa RowsAtCompileTime, ColsAtCompileTime */ - MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, - internal::traits::MaxColsAtCompileTime>::ret) + MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime) }; typedef typename internal::traits::Scalar Scalar; @@ -153,8 +154,8 @@ template class TriangularBase : public EigenBase * * \brief Expression of a triangular part in a matrix * - * \param MatrixType the type of the object in which we are taking the triangular part - * \param Mode the kind of triangular matrix expression to construct. Can be #Upper, + * \tparam MatrixType the type of the object in which we are taking the triangular part + * \tparam Mode the kind of triangular matrix expression to construct. Can be #Upper, * #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower. * This is in fact a bit field; it must have either #Upper or #Lower, * and additionally it may have #UnitDiag or #ZeroDiag or neither. @@ -166,39 +167,39 @@ template class TriangularBase : public EigenBase * \sa MatrixBase::triangularView() */ namespace internal { -template -struct traits > : traits +template +struct traits > : traits { typedef typename ref_selector::non_const_type MatrixTypeNested; - typedef typename remove_reference::type MatrixTypeNestedNonRef; - typedef typename remove_all::type MatrixTypeNestedCleaned; + typedef std::remove_reference_t MatrixTypeNestedNonRef; + typedef remove_all_t MatrixTypeNestedCleaned; typedef typename MatrixType::PlainObject FullMatrixType; typedef MatrixType ExpressionType; enum { - Mode = _Mode, + Mode = Mode_, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) }; }; } -template class TriangularViewImpl; +template class TriangularViewImpl; -template class TriangularView - : public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > +template class TriangularView + : public TriangularViewImpl::StorageKind > { public: - typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base; + typedef TriangularViewImpl::StorageKind > Base; typedef typename internal::traits::Scalar Scalar; - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; protected: typedef typename internal::traits::MatrixTypeNested MatrixTypeNested; typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; - typedef typename internal::remove_all::type MatrixConjugateReturnType; - typedef TriangularView::type, _Mode> ConstTriangularView; + typedef internal::remove_all_t MatrixConjugateReturnType; + typedef TriangularView, Mode_> ConstTriangularView; public: @@ -206,7 +207,7 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedCleaned NestedExpression; enum { - Mode = _Mode, + Mode = Mode_, Flags = internal::traits::Flags, TransposeMode = (Mode & Upper ? Lower : 0) | (Mode & Lower ? 
Upper : 0) @@ -247,10 +248,10 @@ template class TriangularView */ template EIGEN_DEVICE_FUNC - inline typename internal::conditional::type + inline std::conditional_t conjugateIf() const { - typedef typename internal::conditional::type ReturnType; + typedef std::conditional_t ReturnType; return ReturnType(m_matrix.template conjugateIf()); } @@ -262,10 +263,10 @@ template class TriangularView typedef TriangularView TransposeReturnType; /** \sa MatrixBase::transpose() */ + template EIGEN_DEVICE_FUNC - inline TransposeReturnType transpose() + inline TransposeReturnType transpose(std::enable_if_t::value, Dummy*> = nullptr) { - EIGEN_STATIC_ASSERT_LVALUE(MatrixType) typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } @@ -342,16 +343,17 @@ template class TriangularView * * \sa class TriangularView, MatrixBase::triangularView() */ -template class TriangularViewImpl<_MatrixType,_Mode,Dense> - : public TriangularBase > +template class TriangularViewImpl + : public TriangularBase > { public: - typedef TriangularView<_MatrixType, _Mode> TriangularViewType; + typedef TriangularView TriangularViewType; + typedef TriangularBase Base; typedef typename internal::traits::Scalar Scalar; - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef typename MatrixType::PlainObject DenseMatrixType; typedef DenseMatrixType PlainObject; @@ -362,7 +364,7 @@ template class TriangularViewImpl<_Mat typedef typename internal::traits::StorageKind StorageKind; enum { - Mode = _Mode, + Mode = Mode_, Flags = internal::traits::Flags }; @@ -728,10 +730,10 @@ struct evaluator_traits > template struct unary_evaluator, IndexBased> - : evaluator::type> + : evaluator> { typedef TriangularView XprType; - typedef evaluator::type> Base; + typedef evaluator> Base; EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; diff --git a/libs/eigen/Eigen/src/Core/VectorBlock.h b/libs/eigen/Eigen/src/Core/VectorBlock.h index 71c5b95..ee28da1 100644 --- a/libs/eigen/Eigen/src/Core/VectorBlock.h +++ b/libs/eigen/Eigen/src/Core/VectorBlock.h @@ -11,6 +11,8 @@ #ifndef EIGEN_VECTORBLOCK_H #define EIGEN_VECTORBLOCK_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -66,6 +68,7 @@ template class VectorBlock }; public: EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock) + EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock) using Base::operator=; @@ -76,18 +79,14 @@ template class VectorBlock : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); - } + { } /** Fixed-size constructor */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 
0 : start) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); - } + { } }; diff --git a/libs/eigen/Eigen/src/Core/VectorwiseOp.h b/libs/eigen/Eigen/src/Core/VectorwiseOp.h index 870f4f1..b004f76 100644 --- a/libs/eigen/Eigen/src/Core/VectorwiseOp.h +++ b/libs/eigen/Eigen/src/Core/VectorwiseOp.h @@ -11,6 +11,8 @@ #ifndef EIGEN_PARTIAL_REDUX_H #define EIGEN_PARTIAL_REDUX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \class PartialReduxExpr @@ -86,7 +88,6 @@ template struct partial_redux_dummy_func; #define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ template \ struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ typedef ResultType result_type; \ typedef BINARYOP BinaryOp; \ template struct Cost { enum { value = COST }; }; \ @@ -191,7 +192,7 @@ template class VectorwiseOp typedef typename ExpressionType::RealScalar RealScalar; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; - typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; + typedef internal::remove_all_t ExpressionTypeNestedCleaned; template class Functor, typename ReturnScalar=Scalar> struct ReturnType @@ -230,9 +231,9 @@ template class VectorwiseOp typename ExtendedType::Type extendedTo(const DenseBase& other) const { - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1), + EIGEN_STATIC_ASSERT(internal::check_implication(isVertical, OtherDerived::MaxColsAtCompileTime==1), YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED) - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1), + EIGEN_STATIC_ASSERT(internal::check_implication(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1), YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED) return typename ExtendedType::Type (other.derived(), @@ -253,9 +254,9 @@ template class VectorwiseOp typename OppositeExtendedType::Type extendedToOpposite(const DenseBase& other) const { - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1), + EIGEN_STATIC_ASSERT(internal::check_implication(isHorizontal, OtherDerived::MaxColsAtCompileTime==1), YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED) - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1), + EIGEN_STATIC_ASSERT(internal::check_implication(isVertical, OtherDerived::MaxRowsAtCompileTime==1), YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED) return typename OppositeExtendedType::Type (other.derived(), @@ -594,7 +595,7 @@ template class VectorwiseOp return m_matrix += extendedTo(other.derived()); } - /** Substracts the vector \a other to each subvector of \c *this */ + /** Subtracts the vector \a other to each subvector of \c *this */ template EIGEN_DEVICE_FUNC ExpressionType& operator-=(const DenseBase& other) @@ -604,7 +605,7 @@ template class VectorwiseOp return m_matrix -= extendedTo(other.derived()); } - /** Multiples each subvector of \c *this by the vector \a other */ + /** Multiplies each subvector of \c *this by the vector \a other */ template EIGEN_DEVICE_FUNC ExpressionType& operator*=(const DenseBase& other) diff --git a/libs/eigen/Eigen/src/Core/Visitor.h b/libs/eigen/Eigen/src/Core/Visitor.h index 00bcca8..e1c17fc 100644 --- a/libs/eigen/Eigen/src/Core/Visitor.h +++ b/libs/eigen/Eigen/src/Core/Visitor.h @@ -10,16 +10,23 @@ #ifndef EIGEN_VISITOR_H #define EIGEN_VISITOR_H +#include "./InternalHeaderCheck.h" + 
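For context on the VectorwiseOp arithmetic operators touched above (the broadcasting operator+=, operator-= and operator*=), a minimal usage sketch against the public Eigen API; this is an editor's illustration, not part of the patch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Ones(3, 4);
  Eigen::Vector3d v(0, 10, 20);
  // Broadcasts v to every column of m. Passing a row vector here would hit
  // the YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED static
  // assertion, now spelled with internal::check_implication as shown above.
  m.colwise() += v;
  std::cout << m << "\n";  // every column is now (1, 11, 21)
}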
namespace Eigen { namespace internal { +template::PacketAccess)> +struct visitor_impl; + template -struct visitor_impl +struct visitor_impl { enum { - col = (UnrollCount-1) / Derived::RowsAtCompileTime, - row = (UnrollCount-1) % Derived::RowsAtCompileTime + col = Derived::IsRowMajor ? (UnrollCount-1) % Derived::ColsAtCompileTime + : (UnrollCount-1) / Derived::RowsAtCompileTime, + row = Derived::IsRowMajor ? (UnrollCount-1) / Derived::ColsAtCompileTime + : (UnrollCount-1) % Derived::RowsAtCompileTime }; EIGEN_DEVICE_FUNC @@ -31,7 +38,7 @@ struct visitor_impl }; template -struct visitor_impl +struct visitor_impl { EIGEN_DEVICE_FUNC static inline void run(const Derived &mat, Visitor& visitor) @@ -42,24 +49,73 @@ struct visitor_impl // This specialization enables visitors on empty matrices at compile-time template -struct visitor_impl { +struct visitor_impl { EIGEN_DEVICE_FUNC static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) {} }; template -struct visitor_impl +struct visitor_impl { EIGEN_DEVICE_FUNC static inline void run(const Derived& mat, Visitor& visitor) { visitor.init(mat.coeff(0,0), 0, 0); - for(Index i = 1; i < mat.rows(); ++i) - visitor(mat.coeff(i, 0), i, 0); - for(Index j = 1; j < mat.cols(); ++j) - for(Index i = 0; i < mat.rows(); ++i) - visitor(mat.coeff(i, j), i, j); + if (Derived::IsRowMajor) { + for(Index i = 1; i < mat.cols(); ++i) { + visitor(mat.coeff(0, i), 0, i); + } + for(Index j = 1; j < mat.rows(); ++j) { + for(Index i = 0; i < mat.cols(); ++i) { + visitor(mat.coeff(j, i), j, i); + } + } + } else { + for(Index i = 1; i < mat.rows(); ++i) { + visitor(mat.coeff(i, 0), i, 0); + } + for(Index j = 1; j < mat.cols(); ++j) { + for(Index i = 0; i < mat.rows(); ++i) { + visitor(mat.coeff(i, j), i, j); + } + } + } + } +}; + +template +struct visitor_impl +{ + typedef typename Derived::Scalar Scalar; + typedef typename packet_traits::type Packet; + + EIGEN_DEVICE_FUNC + static inline void run(const Derived& mat, Visitor& visitor) + { + const Index PacketSize = packet_traits::size; + visitor.init(mat.coeff(0,0), 0, 0); + if (Derived::IsRowMajor) { + for(Index i = 0; i < mat.rows(); ++i) { + Index j = i == 0 ? 1 : 0; + for(; j+PacketSize-1 < mat.cols(); j += PacketSize) { + Packet p = mat.packet(i, j); + visitor.packet(p, i, j); + } + for(; j < mat.cols(); ++j) + visitor(mat.coeff(i, j), i, j); + } + } else { + for(Index j = 0; j < mat.cols(); ++j) { + Index i = j == 0 ? 
1 : 0; + for(; i+PacketSize-1 < mat.rows(); i += PacketSize) { + Packet p = mat.packet(i, j); + visitor.packet(p, i, j); + } + for(; i < mat.rows(); ++i) + visitor(mat.coeff(i, j), i, j); + } + } } }; @@ -68,28 +124,38 @@ template class visitor_evaluator { public: - EIGEN_DEVICE_FUNC - explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef internal::evaluator Evaluator; enum { + PacketAccess = Evaluator::Flags & PacketAccessBit, + IsRowMajor = XprType::IsRowMajor, RowsAtCompileTime = XprType::RowsAtCompileTime, - CoeffReadCost = internal::evaluator::CoeffReadCost + ColsAtCompileTime = XprType::ColsAtCompileTime, + CoeffReadCost = Evaluator::CoeffReadCost }; + + EIGEN_DEVICE_FUNC + explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) { } + + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef std::remove_const_t PacketReturnType; + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const { return m_evaluator.coeff(row, col); } + EIGEN_DEVICE_FUNC PacketReturnType packet(Index row, Index col) const + { return m_evaluator.template packet(row, col); } protected: - internal::evaluator m_evaluator; + Evaluator m_evaluator; const XprType &m_xpr; }; + } // end namespace internal /** Applies the visitor \a visitor to the whole coefficients of the matrix or vector. 
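Since the doc line above refers to DenseBase::visit(), here is a self-contained example of the visitor contract it expects: init() for coefficient (0,0) and operator() for all remaining coefficients. A scalar-only visitor like this one keeps the coefficient-wise path, because the default functor_traits do not advertise PacketAccess (editor's sketch, public API only):

#include <Eigen/Dense>
#include <cmath>
#include <iostream>

// Tracks the coefficient of largest magnitude and its location.
struct AbsMaxVisitor {
  double res;
  Eigen::Index row, col;
  void init(const double& v, Eigen::Index i, Eigen::Index j) {
    res = std::abs(v); row = i; col = j;
  }
  void operator()(const double& v, Eigen::Index i, Eigen::Index j) {
    if (std::abs(v) > res) { res = std::abs(v); row = i; col = j; }
  }
};

int main() {
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  AbsMaxVisitor vis;
  m.visit(vis);
  std::cout << "|max| = " << vis.res
            << " at (" << vis.row << ", " << vis.col << ")\n";
}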
@@ -152,123 +218,131 @@ struct coeff_visitor } }; -/** \internal - * \brief Visitor computing the min coefficient with its value and coordinates - * - * \sa DenseBase::minCoeff(Index*, Index*) - */ -template -struct min_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if(value < this->res) - { - this->res = value; - this->row = i; - this->col = j; - } - } -}; -template -struct min_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res)) - { - this->res = value; - this->row = i; - this->col = j; - } - } -}; - -template -struct min_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if((numext::isnan)(value) || value < this->res) - { - this->res = value; - this->row = i; - this->col = j; - } - } +template +struct minmax_compare { + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a < b; } + static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_min(p);} }; template - struct functor_traits > { +struct minmax_compare { + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC inline bool compare(Scalar a, Scalar b) { return a > b; } + static EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& p) { return predux_max(p);} +}; + +template +struct minmax_coeff_visitor : coeff_visitor +{ + using Scalar = typename Derived::Scalar; + using Packet = typename packet_traits::type; + using Comparator = minmax_compare; + + EIGEN_DEVICE_FUNC inline + void operator() (const Scalar& value, Index i, Index j) + { + if(Comparator::compare(value, this->res)) { + this->res = value; + this->row = i; + this->col = j; + } + } + + EIGEN_DEVICE_FUNC inline + void packet(const Packet& p, Index i, Index j) { + const Index PacketSize = packet_traits::size; + Scalar value = Comparator::predux(p); + if (Comparator::compare(value, this->res)) { + const Packet range = preverse(plset(Scalar(1))); + Packet mask = pcmp_eq(pset1(value), p); + Index max_idx = PacketSize - static_cast(predux_max(pand(range, mask))); + this->res = value; + this->row = Derived::IsRowMajor ? i : i + max_idx;; + this->col = Derived::IsRowMajor ? j + max_idx : j; + } + } +}; + +// Suppress NaN. The only case in which we return NaN is if the matrix is all NaN, in which case, +// the row=0, col=0 is returned for the location. +template +struct minmax_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + using Packet = typename packet_traits::type; + using Comparator = minmax_compare; + + EIGEN_DEVICE_FUNC inline + void operator() (const Scalar& value, Index i, Index j) + { + if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) { + this->res = value; + this->row = i; + this->col = j; + } + } + + EIGEN_DEVICE_FUNC inline + void packet(const Packet& p, Index i, Index j) { + const Index PacketSize = packet_traits::size; + Scalar value = Comparator::predux(p); + if ((!(numext::isnan)(value) && (numext::isnan)(this->res)) || Comparator::compare(value, this->res)) { + const Packet range = preverse(plset(Scalar(1))); + /* mask will be zero for NaNs, so they will be ignored. 
*/ + Packet mask = pcmp_eq(pset1(value), p); + Index max_idx = PacketSize - static_cast(predux_max(pand(range, mask))); + this->res = value; + this->row = Derived::IsRowMajor ? i : i + max_idx;; + this->col = Derived::IsRowMajor ? j + max_idx : j; + } + } + +}; + +// Propagate NaN. If the matrix contains NaN, the location of the first NaN will be returned in +// row and col. +template +struct minmax_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + using Packet = typename packet_traits::type; + using Comparator = minmax_compare; + + EIGEN_DEVICE_FUNC inline + void operator() (const Scalar& value, Index i, Index j) + { + const bool value_is_nan = (numext::isnan)(value); + if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) { + this->res = value; + this->row = i; + this->col = j; + } + } + + EIGEN_DEVICE_FUNC inline + void packet(const Packet& p, Index i, Index j) { + const Index PacketSize = packet_traits::size; + Scalar value = Comparator::predux(p); + const bool value_is_nan = (numext::isnan)(value); + if ((value_is_nan && !(numext::isnan)(this->res)) || Comparator::compare(value, this->res)) { + const Packet range = preverse(plset(Scalar(1))); + // If the value is NaN, pick the first position of a NaN, otherwise pick the first extremal value. + Packet mask = value_is_nan ? pnot(pcmp_eq(p, p)) : pcmp_eq(pset1(value), p); + Index max_idx = PacketSize - static_cast(predux_max(pand(range, mask))); + this->res = value; + this->row = Derived::IsRowMajor ? i : i + max_idx;; + this->col = Derived::IsRowMajor ? j + max_idx : j; + } + } +}; + +template +struct functor_traits > { enum { - Cost = NumTraits::AddCost - }; -}; - -/** \internal - * \brief Visitor computing the max coefficient with its value and coordinates - * - * \sa DenseBase::maxCoeff(Index*, Index*) - */ -template -struct max_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if(value > this->res) - { - this->res = value; - this->row = i; - this->col = j; - } - } -}; - -template -struct max_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res)) - { - this->res = value; - this->row = i; - this->col = j; - } - } -}; - -template -struct max_coeff_visitor : coeff_visitor -{ - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - void operator() (const Scalar& value, Index i, Index j) - { - if((numext::isnan)(value) || value > this->res) - { - this->res = value; - this->row = i; - this->col = j; - } - } -}; - -template -struct functor_traits > { - enum { - Cost = NumTraits::AddCost + Cost = NumTraits::AddCost, + PacketAccess = true }; }; @@ -293,7 +367,7 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - internal::min_coeff_visitor minVisitor; + internal::minmax_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; if (colId) *colId = minVisitor.col; @@ -319,7 +393,7 @@ DenseBase::minCoeff(IndexType* index) const eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::min_coeff_visitor minVisitor; + internal::minmax_coeff_visitor minVisitor; this->visit(minVisitor); *index 
= IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row); return minVisitor.res; @@ -344,7 +418,7 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - internal::max_coeff_visitor maxVisitor; + internal::minmax_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; if (colPtr) *colPtr = maxVisitor.col; @@ -370,7 +444,7 @@ DenseBase::maxCoeff(IndexType* index) const eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::max_coeff_visitor maxVisitor; + internal::minmax_coeff_visitor maxVisitor; this->visit(maxVisitor); *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row; return maxVisitor.res; diff --git a/libs/eigen/Eigen/src/Core/arch/AVX/Complex.h b/libs/eigen/Eigen/src/Core/arch/AVX/Complex.h index ab7bd6c..3abb5bd 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX/Complex.h @@ -10,6 +10,8 @@ #ifndef EIGEN_COMPLEX_AVX_H #define EIGEN_COMPLEX_AVX_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -99,7 +101,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) @@ -167,15 +171,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } + EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { - Packet4cf num = pmul(a, pconj(b)); - __m256 tmp = _mm256_mul_ps(b.v, b.v); - __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); - __m256 denom = _mm256_add_ps(tmp, tmp2); - return Packet4cf(_mm256_div_ps(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) @@ -321,10 +322,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { - Packet2cd num = pmul(a, pconj(b)); - __m256d tmp = _mm256_mul_pd(b.v, b.v); - __m256d denom = _mm256_hadd_pd(tmp, tmp); - return Packet2cd(_mm256_div_pd(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) diff --git a/libs/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h index 67041c8..cb7d7b8 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -14,52 +14,78 @@ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f psin(const Packet8f& _x) { return psin_float(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f pcos(const Packet8f& _x) { return pcos_float(_x); } 
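Stepping back to the minmax_coeff_visitor packet() overloads above: the lane-index trick there is compact but dense. range = preverse(plset(1)) holds {N, N-1, ..., 1}, it is AND-ed with the equality mask, and N - predux_max(...) then recovers the first matching lane. A scalar model of the same arithmetic (editor's illustration, not patch code):

#include <cstdio>

// Lanes where p[k] == value keep range[k] = N - k, all others contribute 0,
// so the reduction is largest at the FIRST match.
int first_match_lane(const float* p, float value, int N) {
  int reduced_max = 0;                               // predux_max accumulator
  for (int k = 0; k < N; ++k) {
    int masked = (p[k] == value) ? (N - k) : 0;      // pand(range, mask), lane k
    if (masked > reduced_max) reduced_max = masked;
  }
  return N - reduced_max;                            // first match (N if none)
}

int main() {
  const float lanes[8] = {3, 1, 4, 1, 5, 9, 2, 6};
  std::printf("%d\n", first_match_lane(lanes, 1.0f, 8));  // prints 1
}

These visitors are what DenseBase::minCoeff(&row, &col) and maxCoeff(&row, &col) dispatch to, as the hunks above show.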
template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f +pasin(const Packet8f& _x) { + return pasin_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f +pacos(const Packet8f& _x) { + return pacos_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f +patan(const Packet8f& _x) { + return patan_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d +patan(const Packet4d& _x) { + return patan_double(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f plog(const Packet8f& _x) { return plog_float(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d plog(const Packet4d& _x) { return plog_double(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f plog2(const Packet8f& _x) { return plog2_float(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d plog2(const Packet4d& _x) { return plog2_double(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f plog1p(const Packet8f& _x) { return generic_plog1p(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f pexpm1(const Packet8f& _x) { return generic_expm1(_x); } @@ -68,110 +94,59 @@ Packet8f pexpm1(const Packet8f& _x) { // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f pexp(const Packet8f& _x) { return pexp_float(_x); } // Hyperbolic Tangent function. template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f ptanh(const Packet8f& _x) { return internal::generic_fast_tanh_float(_x); } // Exponential function for doubles. template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d pexp(const Packet4d& _x) { return pexp_double(_x); } -// Functions for sqrt. -// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step -// of Newton's method, at a cost of 1-2 bits of precision as opposed to the -// exact solution. It does not handle +inf, or denormalized numbers correctly. -// The main advantage of this approach is not just speed, but also the fact that -// it can be inlined and pipelined with other computations, further reducing its -// effective latency. This is similar to Quake3's fast inverse square root. 
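The pexp comment above describes the classic range reduction exp(x) = 2^m * exp(r). A scalar sketch of that decomposition (editor's illustration; the actual kernel evaluates exp(r) with a tuned polynomial rather than std::exp):

#include <cmath>
#include <cstdio>

double exp_via_range_reduction(double x) {
  const double ln2 = 0.6931471805599453;
  double m = std::floor(x / ln2 + 0.5);      // m = floor(x/log(2) + 1/2)
  double r = x - m * ln2;                    // remainder, |r| <= log(2)/2
  return std::ldexp(std::exp(r), (int)m);    // exp(x) = 2^m * exp(r)
}

int main() {
  std::printf("%.15g\n%.15g\n", exp_via_range_reduction(3.7), std::exp(3.7));
}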
-// For detail see here: http://www.beyond3d.com/content/articles/8/ -#if EIGEN_FAST_MATH -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& _x) { - Packet8f minus_half_x = pmul(_x, pset1(-0.5f)); - Packet8f denormal_mask = pandnot( - pcmp_lt(_x, pset1((std::numeric_limits::min)())), - pcmp_lt(_x, pzero(_x))); - // Compute approximate reciprocal sqrt. - Packet8f x = _mm256_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); - // Flush results for denormals to zero. - return pandnot(pmul(_x,x), denormal_mask); -} - -#else - -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +// Notice that for newer processors, it is counterproductive to use Newton +// iteration for square root. In particular, Skylake and Zen2 processors +// have approximately doubled throughput of the _mm_sqrt_ps instruction +// compared to their predecessors. +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f psqrt(const Packet8f& _x) { return _mm256_sqrt_ps(_x); } - -#endif - -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d psqrt(const Packet4d& _x) { return _mm256_sqrt_pd(_x); } + +// Even on Skylake, using Newton iteration is a win for reciprocal square root. #if EIGEN_FAST_MATH -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); - - Packet8f neg_half = pmul(_x, p8f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); - Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); - - // Compute an approximate result using the rsqrt intrinsic. - Packet8f y_approx = _mm256_rsqrt_ps(_x); - - // Do a single step of Newton-Raphson iteration to improve the approximation. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); - - // Select the result of the Newton-Raphson step for positive normal arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if - // x is zero or a positive denormalized float (equivalent to flushing positive - // denormalized inputs to zero). - return pselect(not_normal_finite_mask, y_approx, y_newton); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet8f prsqrt(const Packet8f& a) { + // _mm256_rsqrt_ps returns -inf for negative denormals. + // _mm512_rsqrt**_ps returns -NaN for negative denormals. We may want + // consistency here. 
+ // const Packet8f rsqrt = pselect(pcmp_lt(a, pzero(a)), + // pset1(-NumTraits::quiet_NaN()), + // _mm256_rsqrt_ps(a)); + return generic_rsqrt_newton_step::run(a, _mm256_rsqrt_ps(a)); } -#else -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x)); +template<> EIGEN_STRONG_INLINE Packet8f preciprocal(const Packet8f& a) { + return generic_reciprocal_newton_step::run(a, _mm256_rcp_ps(a)); } + #endif -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& _x) { - _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x)); -} F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) @@ -183,6 +158,7 @@ F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt) +F16_PACKET_FUNCTION(Packet8f, Packet8h, preciprocal) template <> EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) { @@ -207,6 +183,7 @@ BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, preciprocal) template <> EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) { diff --git a/libs/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/AVX/PacketMath.h index 7fc32fd..0fe830a 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_AVX_H #define EIGEN_PACKET_MATH_AVX_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -29,28 +31,29 @@ namespace internal { #endif typedef __m256 Packet8f; -typedef __m256i Packet8i; +typedef eigen_packet_wrapper<__m256i, 0> Packet8i; typedef __m256d Packet4d; +#ifndef EIGEN_VECTORIZE_AVX512FP16 typedef eigen_packet_wrapper<__m128i, 2> Packet8h; +#endif typedef eigen_packet_wrapper<__m128i, 3> Packet8bf; +#ifdef EIGEN_VECTORIZE_AVX2 +// Start from 3 to be compatible with AVX512 +typedef eigen_packet_wrapper<__m256i, 3> Packet4l; +#endif + template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> struct is_arithmetic { enum { value = true }; }; +#endif template<> struct is_arithmetic { enum { value = true }; }; - -#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ - const Packet8f p8f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ - const Packet4d p4d_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ - const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1(X)) - -#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ - const Packet8i p8i_##NAME = pset1(X) +#ifdef EIGEN_VECTORIZE_AVX2 +template<> struct is_arithmetic { enum { value = true }; }; +#endif // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going // to leverage AVX512 instructions. 
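One note on the typedef changes above: __m256i is a single type no matter the element width, so raw typedefs cannot distinguish an int32 packet from an int64 packet during overload resolution; eigen_packet_wrapper adds an integer tag to create distinct types (hence Packet4l starting "from 3 to be compatible with AVX512"). A minimal sketch of the idea, not the actual wrapper (compile with -mavx2):

#include <immintrin.h>

template <typename T, int Tag>
struct packet_wrapper {
  T v;
  packet_wrapper() = default;
  packet_wrapper(const T& x) : v(x) {}
  operator T() const { return v; }  // implicit round-trip to the raw register
};
using PacketI32 = packet_wrapper<__m256i, 0>;  // 8 x int32
using PacketI64 = packet_wrapper<__m256i, 3>;  // 4 x int64

// Same raw register type underneath, yet two distinct overloads:
PacketI32 padd_demo(PacketI32 a, PacketI32 b) { return _mm256_add_epi32(a, b); }
PacketI64 padd_demo(PacketI64 a, PacketI64 b) { return _mm256_add_epi64(a, b); }

int main() { return 0; }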
@@ -67,8 +70,12 @@ template<> struct packet_traits : default_packet_traits HasCmp = 1, HasDiv = 1, + HasReciprocal = EIGEN_FAST_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, + HasACos = 1, + HasASin = 1, + HasATan = 1, HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -102,6 +109,7 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasATan = 1, HasBlend = 1, HasRound = 1, HasFloor = 1, @@ -196,38 +204,74 @@ struct packet_traits : default_packet_traits { HasNdtri = 1 }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasCmp = 1, + HasDiv = 1, + size=8 + }; +}; + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4l type; + // There is no half-size packet for current Packet4l. + // TODO: support as SSE path. + typedef Packet4l half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasCmp = 1, + size=4 + }; +}; +#endif + #endif template<> struct scalar_div_cost { enum { value = 14 }; }; template<> struct scalar_div_cost { enum { value = 16 }; }; -/* Proper support for integers is only provided by AVX2. In the meantime, we'll - use SSE instructions and packets to deal with integers. -template<> struct packet_traits : default_packet_traits -{ - typedef Packet8i type; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=8 - }; -}; -*/ - template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; typedef uint8_t mask_t; - enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true}; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true +#ifdef EIGEN_VECTORIZE_AVX512 + , masked_fpops_available=true +#endif + }; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; }; -template<> struct unpacket_traits { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; +template<> struct unpacket_traits { + typedef int type; + typedef Packet4i half; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +#ifdef EIGEN_VECTORIZE_AVX2 +template<> struct unpacket_traits { + typedef int64_t type; + typedef Packet4l half; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +#endif +template<> struct unpacket_traits { + typedef bfloat16 type; + typedef Packet8bf half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; // Helper function for bit packing snippet of low precision comparison. // It packs the flags from 16x16 to 8x16. 
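Back to the prsqrt/preciprocal hunks above: generic_rsqrt_newton_step polishes the hardware estimate with one Newton-Raphson iteration, y' = y*(1.5 - 0.5*x*y*y), roughly doubling its correct bits; the patch keeps this for rsqrt while dropping it for sqrt, where it no longer pays off on Skylake/Zen2-class cores. A scalar model (editor's illustration; the packet version additionally special-cases zero, infinity and denormals):

#include <cstdio>

float rsqrt_newton_step(float x, float y /* rough 1/sqrt(x) estimate */) {
  // Grouped so the estimate is never squared on its own, which the removed
  // comment noted can spuriously over- or underflow.
  return y * (1.5f - (0.5f * x * y) * y);
}

int main() {
  float x = 2.0f, y = 0.70f;                       // crude seed for 1/sqrt(2)
  std::printf("%.7f\n", rsqrt_newton_step(x, y));  // ~0.7070 vs true 0.7071068
}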
@@ -236,6 +280,210 @@ EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) { _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); } +#ifdef EIGEN_VECTORIZE_AVX2 +template <> +EIGEN_STRONG_INLINE Packet4l pset1(const int64_t& from) { + return _mm256_set1_epi64x(from); +} +template <> +EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) { + return _mm256_setzero_si256(); +} +template <> +EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) { + return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll); +} +template <> +EIGEN_STRONG_INLINE Packet4l pload1(const int64_t* from) { + return _mm256_set1_epi64x(*from); +} +template <> +EIGEN_STRONG_INLINE Packet4l padd(const Packet4l& a, const Packet4l& b) { + return _mm256_add_epi64(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l plset(const int64_t& a) { + return padd(pset1(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll))); +} +template <> +EIGEN_STRONG_INLINE Packet4l psub(const Packet4l& a, const Packet4l& b) { + return _mm256_sub_epi64(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) { + return psub(pzero(a), a); +} +template <> +EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) { + return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1)); +} +template <> +EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) { + return _mm256_cmpgt_epi64(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) { + return _mm256_cmpeq_epi64(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l ptrue(const Packet4l& a) { + return _mm256_cmpeq_epi64(a, a); +} +template <> +EIGEN_STRONG_INLINE Packet4l pand(const Packet4l& a, const Packet4l& b) { + return _mm256_and_si256(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l por(const Packet4l& a, const Packet4l& b) { + return _mm256_or_si256(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l pxor(const Packet4l& a, const Packet4l& b) { + return _mm256_xor_si256(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet4l pandnot(const Packet4l& a, const Packet4l& b) { + return _mm256_andnot_si256(b, a); +} +template +EIGEN_STRONG_INLINE Packet4l plogical_shift_right(Packet4l a) { + return _mm256_srli_epi64(a, N); +} +template +EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) { + return _mm256_slli_epi64(a, N); +} +#ifdef EIGEN_VECTORIZE_AVX512FP16 +template +EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) { return _mm256_srai_epi64(a, N); } +#else +template +EIGEN_STRONG_INLINE std::enable_if_t< (N == 0), Packet4l> parithmetic_shift_right(Packet4l a) { + return a; +} +template +EIGEN_STRONG_INLINE std::enable_if_t< (N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) { + __m256i hi_word = _mm256_srai_epi32(a, N); + __m256i lo_word = _mm256_srli_epi64(a, N); + return _mm256_blend_epi32(hi_word, lo_word, 0b01010101); +} +template +EIGEN_STRONG_INLINE std::enable_if_t< (N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) { + __m256i hi_word = _mm256_srai_epi32(a, 31); + __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask)); + return _mm256_blend_epi32(hi_word, lo_word, 0b01010101); +} +template +EIGEN_STRONG_INLINE std::enable_if_t< (N == 63), Packet4l> parithmetic_shift_right(Packet4l a) { + return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask)); +} 
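The parithmetic_shift_right ladder above synthesizes a 64-bit arithmetic shift from 32-bit AVX2 operations. A portable scalar model of the 0 < N < 32 case (editor's illustration):

#include <cstdint>
#include <cstdio>

int64_t sar64_emulated(int64_t a, int n) {  // assumes 0 < n < 32
  uint64_t u = (uint64_t)a;
  // srai on the high 32-bit word: shifts while replicating the sign bit.
  uint32_t hi = (uint32_t)((int32_t)(u >> 32) >> n);
  // srli on the whole 64-bit lane: supplies the low word, including the bits
  // that cross over from the high word.
  uint32_t lo = (uint32_t)(u >> n);
  // blend: low word from the logical shift, high word from the srai result.
  return (int64_t)(((uint64_t)hi << 32) | lo);
}

int main() {
  std::printf("%lld %lld\n", (long long)sar64_emulated(-1024, 4),
              (long long)(-1024LL >> 4));  // both print -64
}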
+template +EIGEN_STRONG_INLINE std::enable_if_t< (N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) { + return parithmetic_shift_right(a); +} +#endif +template <> +EIGEN_STRONG_INLINE Packet4l pload(const int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet4l ploadu(const int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); +} +// Loads 2 int64_ts from memory a returns the packet {a0, a0, a1, a1} +template <> +EIGEN_STRONG_INLINE Packet4l ploaddup(const int64_t* from) { + const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast(from))); + return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3)); +} +template <> +EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet4l& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet4l& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); +} +template <> +EIGEN_DEVICE_FUNC inline Packet4l pgather(const int64_t* from, Index stride) { + return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]); +} +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int64_t* to, const Packet4l& from, Index stride) { + __m128i low = _mm256_extractf128_si256(from, 0); + to[stride * 0] = _mm_extract_epi64(low, 0); + to[stride * 1] = _mm_extract_epi64(low, 1); + + __m128i high = _mm256_extractf128_si256(from, 1); + to[stride * 2] = _mm_extract_epi64(high, 0); + to[stride * 3] = _mm_extract_epi64(high, 1); +} +template <> +EIGEN_STRONG_INLINE void pstore1(int64_t* to, const int64_t& a) { + Packet4l pa = pset1(a); + pstore(to, pa); +} +template <> +EIGEN_STRONG_INLINE int64_t pfirst(const Packet4l& a) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(a)); +} +template <> +EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) { + __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + return _mm_extract_epi64(r, 0) + _mm_extract_epi64(r, 1); +} +#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M) +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0); + + kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32)); + kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49)); + kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32)); + kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49)); +} +template <> +EIGEN_STRONG_INLINE Packet4l pmin(const Packet4l& a, const Packet4l& b) { + __m256i cmp = _mm256_cmpgt_epi64(a, b); + __m256i a_min = _mm256_andnot_si256(cmp, a); + __m256i b_min = _mm256_and_si256(cmp, b); + return Packet4l(_mm256_or_si256(a_min, b_min)); +} +template <> +EIGEN_STRONG_INLINE Packet4l pmax(const Packet4l& a, const Packet4l& b) { + __m256i cmp = _mm256_cmpgt_epi64(a, b); + __m256i a_min = _mm256_and_si256(cmp, a); + __m256i b_min = _mm256_andnot_si256(cmp, b); + return Packet4l(_mm256_or_si256(a_min, b_min)); +} 
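Just below, pmul for Packet4l assembles a 64-bit product from _mm256_mul_epu32, which multiplies the low 32 bits of each 64-bit lane. A scalar model of that decomposition (editor's illustration); the hi*hi partial product is omitted because it only affects bits 64 and up:

#include <cstdint>
#include <cstdio>

uint64_t mul64_from_32(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t mul1 = a_hi * b_lo;          // mul_epu32(upper32_a, b)
  uint64_t mul2 = b_hi * a_lo;          // mul_epu32(upper32_b, a)
  uint64_t mul3 = a_lo * b_lo;          // mul_epu32(a, b)
  return ((mul1 + mul2) << 32) + mul3;  // high + low partial products
}

int main() {
  uint64_t a = 0x123456789ULL, b = 0xABCDEF12ULL;
  std::printf("%d\n", (int)(mul64_from_32(a, b) == a * b));  // prints 1
}

Since the result is exact modulo 2^64, the same routine is also correct for signed int64 lanes under two's complement, which is why one pmul serves Packet4l.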
+template <> +EIGEN_STRONG_INLINE Packet4l pabs(const Packet4l& a) { + Packet4l pz = pzero(a); + Packet4l cmp = _mm256_cmpgt_epi64(a, pz); + return psub(cmp, pxor(a, cmp)); +} +template <> +EIGEN_STRONG_INLINE Packet4l pmul(const Packet4l& a, const Packet4l& b) { + // 64-bit mul requires avx512, so do this with 32-bit multiplication + __m256i upper32_a = _mm256_srli_epi64(a, 32); + __m256i upper32_b = _mm256_srli_epi64(b, 32); + + // upper * lower + __m256i mul1 = _mm256_mul_epu32(upper32_a, b); + __m256i mul2 = _mm256_mul_epu32(upper32_b, a); + // Gives us both upper*upper and lower*lower + __m256i mul3 = _mm256_mul_epu32(a, b); + + __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32); + return _mm256_add_epi64(high, mul3); +} +#endif template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } @@ -256,10 +504,17 @@ template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { retu template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } -template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } -template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } - template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +#ifdef EIGEN_VECTORIZE_AVX512 +template <> +EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b, uint8_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + return _mm512_castps512_ps256(_mm512_maskz_add_ps( + mask, + _mm512_castps256_ps512(a), + _mm512_castps256_ps512(b))); +} +#endif template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { #ifdef EIGEN_VECTORIZE_AVX2 @@ -271,6 +526,10 @@ template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return padd(pset1(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return padd(pset1(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet8i plset(const int& a) { return padd(pset1(a), (Packet8i)_mm256_set_epi32(7,6,5,4,3,2,1,0)); } + template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { @@ -285,11 +544,17 @@ template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); + const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); } template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) { - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); + const Packet4d mask = 
_mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL)); + return _mm256_xor_pd(a, mask); +} +template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) +{ + return psub(pzero(a), a); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } @@ -310,36 +575,58 @@ template<> EIGEN_STRONG_INLINE Packet8i pmul(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, const Packet8i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by AVX"); - return pset1(0); + +template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& a, const Packet8i& b) +{ +#ifdef EIGEN_VECTORIZE_AVX512 + return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b))); +#else + Packet4i lo = pdiv(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + Packet4i hi = pdiv(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +#endif } #ifdef EIGEN_VECTORIZE_FMA -template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) - // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, - // and even register spilling with clang>=6.0 (bug 1637). - // Gcc stupidly generates a vfmadd132ps instruction. - // So let's enforce it to generate a vfmadd231ps instruction since the most common use - // case is to accumulate the result of the product. 
- Packet8f res = c; - __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - return _mm256_fmadd_ps(a,b,c); -#endif +template <> +EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { + return _mm256_fmadd_ps(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) - // see above - Packet4d res = c; - __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - return _mm256_fmadd_pd(a,b,c); -#endif +template <> +EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { + return _mm256_fmadd_pd(a, b, c); } + +template <> +EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) { + return _mm256_fmsub_ps(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) { + return _mm256_fmsub_pd(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { + return _mm256_fnmadd_ps(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { + return _mm256_fnmadd_pd(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) { + return _mm256_fnmsub_ps(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) { + return _mm256_fnmsub_pd(a, b, c); +} + #endif template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } @@ -352,7 +639,26 @@ template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); } template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } - +template<> EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(_mm256_cmpgt_epi32(a,b), _mm256_set1_epi32(-1)); +#else + __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + lo = _mm_xor_si128(lo, _mm_set1_epi32(-1)); + __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + hi = _mm_xor_si128(hi, _mm_set1_epi32(-1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} +template<> EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpgt_epi32(b,a); +#else + __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0)); + __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { #ifdef EIGEN_VECTORIZE_AVX2 return _mm256_cmpeq_epi32(a,b); @@ -388,6 +694,15 @@ template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const return _mm256_min_pd(b,a); #endif } +template<> EIGEN_STRONG_INLINE Packet8i 
pmin(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_min_epi32(a, b); +#else + __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 @@ -411,6 +726,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const return _mm256_max_pd(b,a); #endif } +template<> EIGEN_STRONG_INLINE Packet8i pmax(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_max_epi32(a, b); +#else + __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) { + return _mm256_sign_epi32(_mm256_set1_epi32(1), a); +} +#endif // Add specializations for min/max with prescribed NaN progation. template<> @@ -583,11 +913,16 @@ template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { E template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { +#ifdef EIGEN_VECTORIZE_AVX512 + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from)); +#else Packet8i mask = _mm256_set1_epi8(static_cast(umask)); const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); mask = por(mask, bit_mask); mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +#endif } // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} @@ -605,12 +940,26 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // then we can perform a consistent permutation on the global register to get everything in shape: return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); } -// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} +// Loads 2 doubles from memory a returns the packet {a0, a0, a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) { Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); return _mm256_permute_pd(tmp, 3<<2); } +// Loads 4 integers from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3, a3} +template<> EIGEN_STRONG_INLINE Packet8i ploaddup(const int* from) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + const Packet8i a = _mm256_castsi128_si256(ploadu(from)); + return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3)); +#else + __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return 
+#endif
+}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from)
@@ -618,6 +967,10 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from)
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
+template<> EIGEN_STRONG_INLINE Packet8i ploadquad(const int* from)
+{
+ return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1);
+}
template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
@@ -628,11 +981,16 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d&
template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) {
+#ifdef EIGEN_VECTORIZE_AVX512
+ __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
+ EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
+#else
Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
mask = por(mask, bit_mask);
mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff));
EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+#endif
}
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
@@ -646,6 +1004,11 @@ template<> EIGEN_DEVICE_FUNC inline Packet4d pgather(const dou
{
return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
+template<> EIGEN_DEVICE_FUNC inline Packet8i pgather(const int* from, Index stride)
+{
+ return _mm256_set_epi32(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+ from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet8f& from, Index stride)
{
@@ -670,6 +1033,20 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to,
to[stride*2] = _mm_cvtsd_f64(high);
to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
+template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet8i& from, Index stride)
+{
+ __m128i low = _mm256_extractf128_si256(from, 0);
+ to[stride*0] = _mm_extract_epi32(low, 0);
+ to[stride*1] = _mm_extract_epi32(low, 1);
+ to[stride*2] = _mm_extract_epi32(low, 2);
+ to[stride*3] = _mm_extract_epi32(low, 3);
+
+ __m128i high = _mm256_extractf128_si256(from, 1);
+ to[stride*4] = _mm_extract_epi32(high, 0);
+ to[stride*5] = _mm_extract_epi32(high, 1);
+ to[stride*6] = _mm_extract_epi32(high, 2);
+ to[stride*7] = _mm_extract_epi32(high, 3);
+}
template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a)
{
@@ -720,6 +1097,17 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
return _mm256_permute_pd(swap_halves,5);
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a)
+{
+ return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template<>
EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) +{ + return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a))); +} +#endif // pabs should be ok template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) @@ -732,6 +1120,23 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); return _mm256_and_pd(a,mask); } +template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_abs_epi32(a); +#else + __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0)); + __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) { return _mm_srai_epi16(a, 15); } +template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_srai_epi16(a, 15); } +template<> EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) { return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a))); } +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) { return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a))); } +#endif template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { return pfrexp_generic(a,exponent); @@ -803,11 +1208,19 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) { return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } +template<> EIGEN_STRONG_INLINE int predux(const Packet8i& a) +{ + return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1)))); +} template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } +template<> EIGEN_STRONG_INLINE Packet4i predux_half_dowto4(const Packet8i& a) +{ + return _mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1)); +} template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) { @@ -856,7 +1269,12 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - return _mm256_movemask_ps(x)!=0; + return _mm256_movemask_ps(x) != 0; +} + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) +{ + return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0; } EIGEN_DEVICE_FUNC inline void @@ -905,6 +1323,66 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); } +#define MM256_SHUFFLE_EPI32(A, B, M) \ + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M)) + +#ifndef EIGEN_VECTORIZE_AVX2 +#define MM256_UNPACKLO_EPI32(A, B) \ + _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B))) +#define MM256_UNPACKHI_EPI32(A, B) \ + _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B))) +#else +#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B) +#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B) +#endif + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]); 
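+ // The same lo/hi 32-bit interleave is applied to the remaining three row
+ // pairs; the 4x4 shuffles and the cross-lane permutes below then finish
+ // the 8x8 transpose.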
+ __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]); + __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]); + __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]); + __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]); + __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256i S4 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256i S5 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256i S6 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256i S7 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]); + __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]); + __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]); + + __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2)); + + kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); @@ -919,21 +1397,37 @@ ptranspose(PacketBlock& kernel) { } template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { +#ifdef EIGEN_VECTORIZE_AVX2 + const __m256i zero = _mm256_setzero_si256(); + const __m256i select = _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256i false_mask = _mm256_cmpeq_epi32(zero, select); + return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask)); +#else const __m256 zero = _mm256_setzero_ps(); const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +#endif } + template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, 
const Packet4d& elsePacket) { +#ifdef EIGEN_VECTORIZE_AVX2 + const __m256i zero = _mm256_setzero_si256(); + const __m256i select = _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256i false_mask = _mm256_cmpeq_epi64(select, zero); + return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask)); +#else const __m256d zero = _mm256_setzero_pd(); const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +#endif } // Packet math for Eigen::half - +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; +#endif template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { return _mm_set1_epi16(numext::bit_cast(from)); @@ -989,18 +1483,9 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { #ifdef EIGEN_HAS_FP16_C return _mm256_cvtph_ps(a); #else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); + Eigen::internal::Packet8f pp = _mm256_castsi256_ps(_mm256_insertf128_si256( + _mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1)); + return pp; #endif } @@ -1008,17 +1493,9 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #ifdef EIGEN_HAS_FP16_C return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); #else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - const numext::uint16_t s0 = numext::bit_cast(Eigen::half(aux[0])); - const numext::uint16_t s1 = numext::bit_cast(Eigen::half(aux[1])); - const numext::uint16_t s2 = numext::bit_cast(Eigen::half(aux[2])); - const numext::uint16_t s3 = numext::bit_cast(Eigen::half(aux[3])); - const numext::uint16_t s4 = numext::bit_cast(Eigen::half(aux[4])); - const numext::uint16_t s5 = numext::bit_cast(Eigen::half(aux[5])); - const numext::uint16_t s6 = numext::bit_cast(Eigen::half(aux[6])); - const numext::uint16_t s7 = numext::bit_cast(Eigen::half(aux[7])); - return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); + __m128i lo = float2half(_mm256_extractf128_ps(a, 0)); + __m128i hi = float2half(_mm256_extractf128_ps(a, 1)); + return _mm_packus_epi32(lo, hi); #endif } @@ -1097,6 +1574,7 @@ template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { return _mm_xor_si128(a, sign_mask); } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -1124,6 +1602,7 @@ template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8f rf = pdiv(af, bf); return float2half(rf); } +#endif template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) { @@ -1152,11 +1631,14 @@ template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to[stride*7] = aux[7]; } + +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { Packet8f af = half2float(a); float reduced = predux(af); return Eigen::half(reduced); } +#endif template<> 
EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { Packet8f af = half2float(a); @@ -1272,7 +1754,6 @@ EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { // Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { - Packet8bf r; __m256i input = _mm256_castps_si256(a); diff --git a/libs/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h index d507fb6..320479b 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TYPE_CASTING_AVX_H #define EIGEN_TYPE_CASTING_AVX_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/Complex.h b/libs/eigen/Eigen/src/Core/arch/AVX512/Complex.h index 49c72b3..6d8ee2b 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX512/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/Complex.h @@ -10,6 +10,8 @@ #ifndef EIGEN_COMPLEX_AVX512_H #define EIGEN_COMPLEX_AVX512_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -97,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) @@ -157,11 +161,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) { - Packet8cf num = pmul(a, pconj(b)); - __m512 tmp = _mm512_mul_ps(b.v, b.v); - __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); - __m512 denom = _mm512_add_ps(tmp, tmp2); - return Packet8cf(_mm512_div_ps(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) @@ -253,11 +253,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif } template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { @@ -309,47 +305,11 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd 
pmul(const Packet4cd& a, const Packet4cd& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b)
{
- Packet4cd num = pmul(a, pconj(b));
- __m512d tmp = _mm512_mul_pd(b.v, b.v);
- __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp);
- return Packet4cd(_mm512_div_pd(num.v, denom));
+ return pdiv_complex(a, b);
}
template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x)
diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h b/libs/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h
new file mode 100644
index 0000000..cb7cfdf
--- /dev/null
+++ b/libs/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -0,0 +1,1235 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Intel Corporation
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+
+#if EIGEN_COMP_MSVC
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#include <immintrin.h>
+#include <type_traits>
+
+#include "../../InternalHeaderCheck.h"
+
+#if !defined(EIGEN_USE_AVX512_GEMM_KERNELS)
+// Disable new AVX512 kernels by default.
+#define EIGEN_USE_AVX512_GEMM_KERNELS 0
+#endif
+
+#define SECOND_FETCH (32)
+#if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
+// Use fewer registers to load A elements, to work around compiler spills. Lose a
+// bit of performance (less than ~2%).
+#define EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+#endif
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, bool is_unit_inc>
+class gemm_class {
+ using vec = typename packet_traits<Scalar>::type;
+ using vec_ymm = typename unpacket_traits<vec>::half;
+ using vec_xmm = typename unpacket_traits<vec_ymm>::half;
+ using umask_t = typename unpacket_traits<vec>::mask_t;
+
+ static constexpr bool is_f32 = sizeof(Scalar) == sizeof(float);
+ static constexpr bool is_f64 = sizeof(Scalar) == sizeof(double);
+
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+ static constexpr bool use_less_a_regs = !is_unit_inc;
+#else
+ static constexpr bool use_less_a_regs = true;
+#endif
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_B_REGS
+ static constexpr bool use_less_b_regs = !is_unit_inc;
+#else
+ static constexpr bool use_less_b_regs = true;
+#endif
+
+ static constexpr int a_regs[] = {0, 1, 2, use_less_a_regs ? 0 : 3, use_less_a_regs ? 1 : 4, use_less_a_regs ? 2 : 5};
+ static constexpr int b_regs[] = {6, use_less_b_regs ? 6 : 7};
+ static constexpr int c_regs[] = {
+ 8, 16, 24, 9, 17, 25, 10, 18, 26, 11, 19, 27, 12, 20, 28, 13, 21, 29, 14, 22, 30, 15, 23, 31,
+ };
+
+ static constexpr int alpha_load_reg = 0;
+ static constexpr int c_load_regs[] = {1, 2, 6};
+
+ static constexpr int a_shift = 128;
+ static constexpr int b_shift = 128;
+
+ static constexpr int nelems_in_cache_line = is_f32 ? 16 : 8;
+ static constexpr int a_prefetch_size = nelems_in_cache_line * 2;
+ static constexpr int b_prefetch_size = nelems_in_cache_line * 8;
+
+ vec zmm[32];
+ umask_t mask;
+
+ // gemm arguments.
+ Index m; + const Index n, k, ldc; + const Index inc; + const Scalar *alpha; + + const Scalar *a, *b; + Scalar *c; + + const bool is_alpha1; + const bool is_beta0; + + const Index a_stride, b_stride; + const Index a_off, b_off; + + static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; } + + EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) { + _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0); + } + + EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) { + _mm_prefetch((char *)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0); + } + + EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) { _mm_prefetch((char *)(x_addr - a_shift), _MM_HINT_T2); } + + EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) { +#if defined(__PRFCHW__) && __PRFCHW__ == 1 + _m_prefetchw((void *)c_addr); +#else + _mm_prefetch((char *)c_addr, _MM_HINT_T0); +#endif + } + + template + EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) { + switch (nelems * sizeof(*a_addr) * 8) { + default: + case 512 * 3: + a_reg = ploadu(a_addr); + break; + case 512 * 2: + a_reg = ploadu(a_addr); + break; + case 512 * 1: + a_reg = ploadu(a_addr); + break; + case 256 * 1: + a_reg = preinterpret(_mm512_broadcast_f64x4(ploadu(reinterpret_cast(a_addr)))); + break; + case 128 * 1: + a_reg = preinterpret(_mm512_broadcast_f32x4(ploadu(reinterpret_cast(a_addr)))); + break; + case 64 * 1: + a_reg = preinterpret(pload1(reinterpret_cast(a_addr))); + break; + case 32 * 1: + a_reg = pload1(a_addr); + break; + } + } + + EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) { b_reg = pload1(b_addr); } + + template + EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) { + if (is_unit_inc) { + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + pstoreu(mem, src); + break; + case 512 * 2: + pstoreu(mem, src); + break; + case 512 * 1: + pstoreu(mem, src); + break; + case 256 * 1: + pstoreu(mem, preinterpret(src)); + break; + case 128 * 1: + pstoreu(mem, preinterpret(src)); + break; + case 64 * 1: + pstorel(mem, preinterpret(src)); + break; + case 32 * 1: + pstores(mem, preinterpret(src)); + break; + } + } else { + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + pscatter(mem, src, inc); + break; + case 512 * 2: + pscatter(mem, src, inc); + break; + case 512 * 1: + pscatter(mem, src, inc); + break; + case 256 * 1: + pscatter(mem, src, inc, mask); + break; + case 128 * 1: + pscatter(mem, src, inc, mask); + break; + case 64 * 1: + pscatter(mem, src, inc, mask); + break; + case 32 * 1: + pscatter(mem, src, inc, mask); + break; + } + } + } + + template + EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec ®) { + if (is_unit_inc) { + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + dst = padd(src, ploadu(mem)); + break; + case 512 * 2: + dst = padd(src, ploadu(mem)); + break; + case 512 * 1: + dst = padd(src, ploadu(mem)); + break; + case 256 * 1: + dst = preinterpret(padd(preinterpret(src), ploadu(mem))); + break; + case 128 * 1: + dst = preinterpret(padd(preinterpret(src), ploadu(mem))); + break; + case 64 * 1: + dst = preinterpret(padd(preinterpret(src), ploadl(mem))); + break; + case 32 * 1: + dst = preinterpret(padds(preinterpret(src), ploads(mem))); + break; + } + } else { + // Zero out scratch register + reg = pzero(reg); + + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + reg = pgather(mem, inc); + dst = padd(src, reg); + break; + case 512 * 
2: + reg = pgather(mem, inc); + dst = padd(src, reg); + break; + case 512 * 1: + reg = pgather(mem, inc); + dst = padd(src, reg); + break; + case 256 * 1: + reg = preinterpret(pgather(mem, inc)); + dst = preinterpret(padd(preinterpret(src), preinterpret(reg))); + break; + case 128 * 1: + reg = preinterpret(pgather(mem, inc)); + dst = preinterpret(padd(preinterpret(src), preinterpret(reg))); + break; + case 64 * 1: + if (is_f32) { + reg = pgather(reg, mem, inc, mask); + dst = preinterpret(padd(preinterpret(src), preinterpret(reg))); + } else { + dst = preinterpret(padd(preinterpret(src), ploadl(mem))); + } + break; + case 32 * 1: + dst = preinterpret(padds(preinterpret(src), ploads(mem))); + break; + } + } + } + + EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) { + dst = pmadd(src1, src2, dst); + +#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0) + // Workaround register spills for gcc and clang + __asm__("#" : [dst] "+v"(dst) : [src1] "%v"(src1), [src2] "v"(src2)); +#endif + } + + template + EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec ®) { + if (is_unit_inc) { + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + dst = pmadd(scale, src, ploadu(mem)); + break; + case 512 * 2: + dst = pmadd(scale, src, ploadu(mem)); + break; + case 512 * 1: + dst = pmadd(scale, src, ploadu(mem)); + break; + case 256 * 1: + dst = + preinterpret(pmadd(preinterpret(scale), preinterpret(src), ploadu(mem))); + break; + case 128 * 1: + dst = + preinterpret(pmadd(preinterpret(scale), preinterpret(src), ploadu(mem))); + break; + case 64 * 1: + dst = + preinterpret(pmadd(preinterpret(scale), preinterpret(src), ploadl(mem))); + break; + case 32 * 1: + dst = + preinterpret(pmadds(preinterpret(scale), preinterpret(src), ploads(mem))); + break; + } + } else { + // Zero out scratch register + reg = pzero(reg); + + switch (nelems * sizeof(*mem) * 8) { + default: + case 512 * 3: + reg = pgather(mem, inc); + dst = pmadd(scale, src, reg); + break; + case 512 * 2: + reg = pgather(mem, inc); + dst = pmadd(scale, src, reg); + break; + case 512 * 1: + reg = pgather(mem, inc); + dst = pmadd(scale, src, reg); + break; + case 256 * 1: + reg = preinterpret(pgather(mem, inc)); + dst = preinterpret( + pmadd(preinterpret(scale), preinterpret(src), preinterpret(reg))); + break; + case 128 * 1: + reg = preinterpret(pgather(mem, inc)); + dst = preinterpret( + pmadd(preinterpret(scale), preinterpret(src), preinterpret(reg))); + break; + case 64 * 1: + if (is_f32) { + reg = pgather(reg, mem, inc, mask); + dst = preinterpret( + pmadd(preinterpret(scale), preinterpret(src), preinterpret(reg))); + } else { + dst = preinterpret( + pmadd(preinterpret(scale), preinterpret(src), ploadl(mem))); + } + break; + case 32 * 1: + dst = + preinterpret(pmadds(preinterpret(scale), preinterpret(src), ploads(mem))); + break; + } + } + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) { + EIGEN_UNUSED_VARIABLE(ao); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) { + if (j < endX) { + if (i < endY) { + auto &a_reg = zmm[a_regs[i + (j % 2) * 3]]; + const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift; + a_load(a_reg, a_addr); + + a_loads(ao); + } else { + a_loads(ao); + } + } + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1, + const Scalar *co2) { + 
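+ // Base case of the compile-time recursion: indices past the unroll bounds
+ // have nothing left to prefetch.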
EIGEN_UNUSED_VARIABLE(co1); + EIGEN_UNUSED_VARIABLE(co2); + } + + /* C prefetch loop structure. + * for (int un = 0; un < 8; un++) { + * if (b_unroll >= un + 1) { + * if (un == 4) co2 = co1 + 4 * ldc; + * + * for (int i = 0; i < um_vecs; i++) { + * Scalar *co = (un + 1 <= 4) ? co1 : co2; + * auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co; + * prefetch_c(co + co_off); + * } + * } + * } + */ + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) { + if (un < max_b_unroll) { + if (b_unroll >= un + 1) { + if (un == 4 && i == 0) co2 = co1 + 4 * ldc; + + if (i < um_vecs) { + Scalar *co = (un + 1 <= 4) ? co1 : co2; + auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co; + prefetch_c(co + co_off); + + prefetch_cs(co1, co2); + } else { + prefetch_cs(co1, co2); + } + + } else { + prefetch_cs(co1, co2); + } + } + } + + // load_c + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) { + EIGEN_UNUSED_VARIABLE(cox); + EIGEN_UNUSED_VARIABLE(alpha_reg); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) { + if (i < um_vecs) { + auto &c_reg = zmm[c_regs[i + idx * 3]]; + auto &c_load_reg = zmm[c_load_regs[i % 3]]; + auto c_mem = cox; + if (is_unit_inc) + c_mem += i * nelems_in_cache_line; + else + c_mem += i * nelems_in_cache_line * inc; + + if (!is_beta0 && is_alpha1) + vaddm(c_reg, c_mem, c_reg, c_load_reg); + else if (!is_beta0 && !is_alpha1) + vfmaddm(c_reg, c_mem, c_reg, alpha_reg, c_load_reg); + else if (is_beta0 && !is_alpha1) + c_reg = pmul(alpha_reg, c_reg); + + scale_load_c(cox, alpha_reg); + } + } + + // store_c + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) { + EIGEN_UNUSED_VARIABLE(cox); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) { + if (i < um_vecs) { + auto &c_reg = zmm[c_regs[i + idx * 3]]; + auto c_mem = cox; + if (is_unit_inc) + c_mem += i * nelems_in_cache_line; + else + c_mem += i * nelems_in_cache_line * inc; + + c_store(c_mem, c_reg); + c_reg = pzero(c_reg); + + write_c(cox); + } + } + + /* C update loop structure. + * co2 = co1 + ldc; + * + * auto &alpha_reg = zmm[alpha_load_reg]; + * if (!is_alpha1) alpha_reg = pload1(alpha); + * + * int idx = 0; + * for (pow = 1; pow <= 8; pow <<= 1) { + * + * if (b_unroll >= pow) { + * for (count = 1; count < (pow + 1) / 2 + 1; count++) { + * if (pow >= 4) co2 += ldc; + * + * const Scalar *cox = (idx == 0) ? co1 : co2; + * + * const int um_vecs = div_up(a_unroll, nelems_in_cache_line); + * scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg); + * write_c<0, um_vecs, idx, a_unroll>(cox); + * + * idx++; + * } + * } + * } + * + * if (b_unroll == 1) + * co1 += ldc; + * else + * co1 = co2 + ldc; + */ + + template + EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) { + if (pow >= 4) cox += ldc; + + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); + auto &alpha_reg = zmm[alpha_load_reg]; + + scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg); + write_c<0, um_vecs, idx, a_unroll>(cox); + } + + template + EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) { + constexpr int idx = pow / 2; + Scalar *&cox = idx == 0 ? 
co1 : co2; + + constexpr int max_count = (pow + 1) / 2; + static_assert(max_count <= 4, "Unsupported max_count."); + + if (1 <= max_count) c_update_1count(cox); + if (2 <= max_count) c_update_1count(cox); + if (3 <= max_count) c_update_1count(cox); + if (4 <= max_count) c_update_1count(cox); + } + + template + EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) { + auto &alpha_reg = zmm[alpha_load_reg]; + + co2 = co1 + ldc; + if (!is_alpha1) alpha_reg = pload1(alpha); + if (!is_unit_inc && a_unroll < nelems_in_cache_line) mask = static_cast((1ull << a_unroll) - 1); + + static_assert(max_b_unroll <= 8, "Unsupported max_b_unroll"); + + if (1 <= max_b_unroll && 1 <= b_unroll) c_update_1pow<1, a_unroll>(co1, co2); + if (2 <= max_b_unroll && 2 <= b_unroll) c_update_1pow<2, a_unroll>(co1, co2); + if (4 <= max_b_unroll && 4 <= b_unroll) c_update_1pow<4, a_unroll>(co1, co2); + if (8 <= max_b_unroll && 8 <= b_unroll) c_update_1pow<8, a_unroll>(co1, co2); + + if (b_unroll == 1) + co1 += ldc; + else + co1 = co2 + ldc; + } + + // compute + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx, + int &fetchB_idx, vec &b_reg) { + EIGEN_UNUSED_VARIABLE(ao); + EIGEN_UNUSED_VARIABLE(bo); + EIGEN_UNUSED_VARIABLE(fetchA_idx); + EIGEN_UNUSED_VARIABLE(fetchB_idx); + EIGEN_UNUSED_VARIABLE(b_reg); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx, + int &fetchB_idx, vec &b_reg) { + if (um < um_vecs) { + auto &c_reg = zmm[c_regs[um + idx * 3]]; + auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]]; + + vfmadd(c_reg, a_reg, b_reg); + + if (!fetch_x && um == 0 && + (((idx == 0 || idx == 6) && (uk % 2 == 0 || is_f64 || ktail)) || + (idx == 3 && (uk % 2 == 1 || is_f64 || ktail)))) { + prefetch_a(ao + nelems_in_cache_line * fetchA_idx); + fetchA_idx++; + } + + if (um == 0 && idx == 1 && (uk % 2 == 0 || is_f64 || ktail)) { + prefetch_b(bo + nelems_in_cache_line * fetchB_idx); + fetchB_idx++; + } + + compute(ao, bo, fetchA_idx, fetchB_idx, b_reg); + } + } + + // load_a + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) { + EIGEN_UNUSED_VARIABLE(ao); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) { + if (um < um_vecs) { + auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]]; + const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift; + a_load(a_reg, a_addr); + + load_a(ao); + } + } + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa, + const Scalar *const &ao, + const Scalar *const &bo, Scalar *&co2, + int &fetchA_idx, int &fetchB_idx) { + EIGEN_UNUSED_VARIABLE(aa); + EIGEN_UNUSED_VARIABLE(ao); + EIGEN_UNUSED_VARIABLE(bo); + EIGEN_UNUSED_VARIABLE(co2); + EIGEN_UNUSED_VARIABLE(fetchA_idx); + EIGEN_UNUSED_VARIABLE(fetchB_idx); + } + + template + EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa, + const Scalar *const &ao, + const Scalar *const &bo, Scalar *&co2, + int &fetchA_idx, int &fetchB_idx) { + const int idx = (pow / 2) + count; + + if (count < (pow + 1) / 2) { + auto &b_reg = zmm[b_regs[idx % 2]]; + + if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa); + if (fetch_x && uk == 3 && idx == 4) aa += 8; + + if (b_unroll >= pow) { + compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg); + + const 
Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift; + b_load(b_reg, b_addr); + } + + // Go to the next count. + innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, + fetchB_idx); + + } else { + // Maybe prefetch C data after count-loop. + if (pow == 2 && c_fetch) { + if (uk % 3 == 0 && uk > 0) { + co2 += ldc; + } else { + prefetch_c(co2 + (uk % 3) * nelems_in_cache_line); + } + } + } + } + + template + EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo, + Scalar *&co2, int &fetchA_idx, int &fetchB_idx) { + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); + + if (max_b_unroll >= 1) + innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, fetchB_idx); + if (max_b_unroll >= 2) + innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, fetchB_idx); + if (max_b_unroll >= 4) + innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, fetchB_idx); + if (max_b_unroll >= 8) + innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, fetchB_idx); + + // Load A after pow-loop. + load_a<0, um_vecs, uk, a_unroll, ktail>(ao); + } + + /* Inner kernel loop structure. + * for (int uk = 0; uk < kfactor; uk++) { + * int idx = 0; + * + * for (pow = 1; pow < max_b_unroll << 1; pow <<= 1) { + * for (int count = 0; count < (pow + 1) / 2; count++) { + * auto &b_reg = zmm[b_regs[idx % 2]]; + * + * if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa); + * if (fetch_x && uk == 3 && idx == 4) aa += 8; + * + * if (b_unroll >= pow) { + * compute<0, um_vecs, idx, uk, fetchx, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg); + * + * const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) - b_shift ; + * b_load(b_reg, b_addr); + * } + * idx++; + * } + * + * Maybe prefetch C data. + * if (pow == 2 && c_fetch) { + * if (uk % 3 == 0 && uk > 0) { + * co2 += ldc; + * } else { + * prefetch_c(co2 + (uk % 3) * nelems_in_cache_line); + * } + * } + * } + * + * Load A. + * load_a<0, um_vecs, uk, ktail, a_unroll>(ao); + * } + * + * Advance A/B pointers after uk-loop. + * ao += a_unroll * kfactor; + * bo += b_unroll * kfactor; + */ + + template + EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) { + int fetchA_idx = 0; + int fetchB_idx = 0; + + const bool fetch_x = k_factor == max_k_factor; + const bool ktail = k_factor == 1; + + static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4"); + + if (k_factor > 0) + innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, + fetchB_idx); + if (k_factor > 1) + innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, + fetchB_idx); + if (k_factor > 2) + innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, + fetchB_idx); + if (k_factor > 3) + innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, + fetchB_idx); + + // Advance A/B pointers after uk-loop. 
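+ // (Each innerkernel_1uk call above consumed one k-step, so the packed A/B
+ // panels advance by k_factor * a_unroll and k_factor * b_unroll elements.)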
+ ao += a_unroll * k_factor; + bo += b_unroll * k_factor; + } + + template + EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) { + const int um_vecs = div_up(a_unroll, nelems_in_cache_line); + if (!use_less_a_regs) + a_loads<0, 2, 0, um_vecs, a_unroll>(ao); + else + a_loads<0, 1, 0, um_vecs, a_unroll>(ao); + + b_load(zmm[b_regs[0]], bo - b_shift + 0); + if (!use_less_b_regs) b_load(zmm[b_regs[1]], bo - b_shift + 1); + +#ifndef SECOND_FETCH + prefetch_cs<0, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2); +#endif // SECOND_FETCH + + // Unrolling k-loop by a factor of 4. + const int max_k_factor = 4; + Index loop_count = k / max_k_factor; + + if (loop_count > 0) { +#ifdef SECOND_FETCH + loop_count -= SECOND_FETCH; +#endif + while (loop_count > 0) { + innerkernel(aa, ao, bo, co2); + loop_count--; + } +#ifdef SECOND_FETCH + co2 = co1 + nelems_in_cache_line - 1; + + loop_count += b_unroll; + while (loop_count > 0) { + innerkernel(aa, ao, bo, co2); + loop_count--; + } + + loop_count += SECOND_FETCH - b_unroll; + while (loop_count > 0) { + innerkernel(aa, ao, bo, co2); + loop_count--; + } +#endif + } + + // k-loop remainder handling. + loop_count = k % max_k_factor; + while (loop_count > 0) { + innerkernel(aa, ao, bo, co2); + loop_count--; + } + + // Update C matrix. + c_update(co1, co2); + } + + template + EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) { + // Set A matrix pointer. + ao = a + a_off * a_unroll; + + // Set B matrix pointer if needed. + bo += b_unroll * b_off; + + kloop(aa, ao, bo, co1, co2); + + // Advance B matrix pointer if needed. + bo += b_unroll * (b_stride - k - b_off); + + // Advance prefetch A pointer. + aa += 16; + } + + template + EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) { + // Set prefetch A pointers. + const Scalar *aa = a + a_unroll * a_stride; + + // Set C matrix pointers. + co1 = c; + if (a_unroll >= max_a_unroll) co2 = c + 2 * ldc; + if (is_unit_inc) + c += a_unroll; + else + c += a_unroll * inc; + + // Set B matrix pointer. + bo = b; + + // Main n-loop. + for (Index i = n / max_b_unroll; i > 0; i--) nloop(aa, ao, bo, co1, co2); + + // n-remainders. + if (n & 4 && max_b_unroll > 4) nloop(aa, ao, bo, co1, co2); +#if 0 + if (n & 2 && max_b_unroll > 2) nloop(aa, ao, bo, co1, co2); + if (n & 1 && max_b_unroll > 1) nloop(aa, ao, bo, co1, co2); +#else + // Copy kernels don't support tails of n = 2 for single/double precision. + // Loop over ones. + int n_rem = 2 * ((n & 2) != 0) + 1 * ((n & 1) != 0); + while (n_rem > 0) { + nloop(aa, ao, bo, co1, co2); + n_rem--; + } +#endif + + // Advance A matrix pointer. + a = ao + a_unroll * (a_stride - k - a_off); + } + + public: + // Compute kernel unrolling C matrix by max_a_unroll x max_b_unroll. + template + EIGEN_ALWAYS_INLINE void compute_kern() { + a -= -a_shift; + b -= -b_shift; + + const Scalar *ao = nullptr; + const Scalar *bo = nullptr; + Scalar *co1 = nullptr; + Scalar *co2 = nullptr; + + // Main m-loop. + for (; m >= max_a_unroll; m -= max_a_unroll) mloop(ao, bo, co1, co2); + + // m-remainders. 
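+ // Illustration: a leftover m = 13 decomposes as 8 + 4 + 1 in the checks
+ // below; the final 1 comes from mloop<1> for double, or from the ones-loop
+ // at the end for float.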
+ if (m & 32 && max_a_unroll > 32) mloop<32, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + if (m & 16 && max_a_unroll > 16) mloop<16, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + if (m & 8 && max_a_unroll > 8) mloop<8, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + if (m & 4 && max_a_unroll > 4) mloop<4, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + if (m & 2 && max_a_unroll > 2 && is_f64) mloop<2, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + if (m & 1 && max_a_unroll > 1 && is_f64) mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + + // Copy kernels don't support tails of m = 2 for single precision. + // Loop over ones. + if (is_f32) { + int m_rem = 2 * ((m & 2) != 0) + 1 * ((m & 1) != 0); + while (m_rem > 0) { + mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2); + m_rem--; + } + } + } + + gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_, + const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_, + Index a_off_, Index b_off_) + : m(m_), + n(n_), + k(k_), + ldc(ldc_), + inc(inc_), + alpha(alpha_), + a(a_), + b(b_), + c(c_), + is_alpha1(is_alpha1_), + is_beta0(is_beta0_), + a_stride(a_stride_), + b_stride(b_stride_), + a_off(a_off_), + b_off(b_off_) { + // Zero out all accumulation registers. + zmm[8] = pzero(zmm[8]); + zmm[9] = pzero(zmm[9]); + zmm[10] = pzero(zmm[10]); + zmm[11] = pzero(zmm[11]); + zmm[12] = pzero(zmm[12]); + zmm[13] = pzero(zmm[13]); + zmm[14] = pzero(zmm[14]); + zmm[15] = pzero(zmm[15]); + zmm[16] = pzero(zmm[16]); + zmm[17] = pzero(zmm[17]); + zmm[18] = pzero(zmm[18]); + zmm[19] = pzero(zmm[19]); + zmm[20] = pzero(zmm[20]); + zmm[21] = pzero(zmm[21]); + zmm[22] = pzero(zmm[22]); + zmm[23] = pzero(zmm[23]); + zmm[24] = pzero(zmm[24]); + zmm[25] = pzero(zmm[25]); + zmm[26] = pzero(zmm[26]); + zmm[27] = pzero(zmm[27]); + zmm[28] = pzero(zmm[28]); + zmm[29] = pzero(zmm[29]); + zmm[30] = pzero(zmm[30]); + zmm[31] = pzero(zmm[31]); + } +}; + +// Compute kernel with max unroll support of: +// Single precision: +// max_a_unroll: 48, 32, 16, 8, 4, 2, 1 +// max_b_unroll: 8, 4, 2, 1 +// Double precision: +// max_a_unroll: 24, 16, 8, 4, 2, 1 +// max_b_unroll: 8, 4, 2, 1 +template +EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b, + Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1, + Index a_off = 0, Index b_off = 0) { + if (a_stride == -1) a_stride = k; + if (b_stride == -1) b_stride = k; + + gemm_class g(m, n, k, ldc, inc, alpha, a, b, c, is_alpha1, is_beta0, a_stride, b_stride, a_off, + b_off); + g.template compute_kern(); +} + +// Template specializations of GEBP kernels with nr = 8. +#if EIGEN_USE_AVX512_GEMM_KERNELS +template +class gebp_traits + : public gebp_traits { + using Base = gebp_traits; + + public: + enum { nr = Base::Vectorizable ? 8 : 4 }; +}; + +template +class gebp_traits + : public gebp_traits { + using Base = gebp_traits; + + public: + enum { nr = Base::Vectorizable ? 
8 : 4 }; +}; + +template +struct gemm_pack_rhs { + typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { PacketSize = packet_traits::size }; + EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0, + Index offset = 0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_rhs::operator()( + Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) { + constexpr int nr = 8; + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0; + Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0; + Index count = 0; + const Index peeled_k = (depth / PacketSize) * PacketSize; + if (nr >= 8) { + for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { + // skip what we have before + if (PanelMode) count += 8 * offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4); + const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5); + const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6); + const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7); + Index k = 0; + if ((PacketSize % 8) == 0) // TODO enable vectorized transposition for PacketSize==4 + { + for (; k < peeled_k; k += PacketSize) { + PacketBlock kernel; + + kernel.packet[0] = dm0.template loadPacket(k); + kernel.packet[1] = dm1.template loadPacket(k); + kernel.packet[2] = dm2.template loadPacket(k); + kernel.packet[3] = dm3.template loadPacket(k); + kernel.packet[4] = dm4.template loadPacket(k); + kernel.packet[5] = dm5.template loadPacket(k); + kernel.packet[6] = dm6.template loadPacket(k); + kernel.packet[7] = dm7.template loadPacket(k); + + ptranspose(kernel); + + pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize])); + pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize])); + pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize])); + pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4 % PacketSize])); + pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5 % PacketSize])); + pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6 % PacketSize])); + pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7 % PacketSize])); + count += 8 * PacketSize; + } + } + for (; k < depth; k++) { + blockB[count + 0] = cj(dm0(k)); + blockB[count + 1] = cj(dm1(k)); + blockB[count + 2] = cj(dm2(k)); + blockB[count + 3] = cj(dm3(k)); + blockB[count + 4] = cj(dm4(k)); + blockB[count + 5] = cj(dm5(k)); + blockB[count + 6] = cj(dm6(k)); + blockB[count + 7] = cj(dm7(k)); + count += 8; + } + // skip what we have after + if (PanelMode) count += 8 * (stride - offset - depth); + } + } + + if (nr >= 4) { + for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) { + // skip what we have before + if (PanelMode) count += 4 * offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = 
rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if ((PacketSize % 4) == 0) // TODO enable vectorized transposition for PacketSize==2 ?? + { + for (; k < peeled_k; k += PacketSize) { + PacketBlock kernel; + kernel.packet[0] = dm0.template loadPacket(k); + kernel.packet[1 % PacketSize] = dm1.template loadPacket(k); + kernel.packet[2 % PacketSize] = dm2.template loadPacket(k); + kernel.packet[3 % PacketSize] = dm3.template loadPacket(k); + ptranspose(kernel); + pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize])); + pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize])); + pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize])); + count += 4 * PacketSize; + } + } + for (; k < depth; k++) { + blockB[count + 0] = cj(dm0(k)); + blockB[count + 1] = cj(dm1(k)); + blockB[count + 2] = cj(dm2(k)); + blockB[count + 3] = cj(dm3(k)); + count += 4; + } + // skip what we have after + if (PanelMode) count += 4 * (stride - offset - depth); + } + } + + // copy the remaining columns one at a time (nr==1) + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + if (PanelMode) count += offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) { + blockB[count] = cj(dm0(k)); + count += 1; + } + if (PanelMode) count += (stride - offset - depth); + } +} + +template +struct gemm_pack_rhs { + typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { + PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size + }; + EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0, + Index offset = 0) { + constexpr int nr = 8; + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0; + Index packet_cols4 = nr >= 4 ? 
(cols / 4) * 4 : 0; + Index count = 0; + + if (nr >= 8) { + for (Index j2 = 0; j2 < packet_cols8; j2 += 8) { + // skip what we have before + if (PanelMode) count += 8 * offset; + for (Index k = 0; k < depth; k++) { + if (PacketSize == 8) { + // Packet A = ploadu(&rhs.data()[k*rhs.stride() + j2]); + Packet A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + } else if (HasHalf && HalfPacketSize == 8) { + HalfPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + } else if (HasQuarter && QuarterPacketSize == 8) { + QuarterPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + } else if (PacketSize == 4) { + // Packet A = ploadu(&rhs.data()[k*rhs.stride() + j2]); + // Packet B = ploadu(&rhs.data()[k*rhs.stride() + j2 + PacketSize]); + Packet A = rhs.template loadPacket(k, j2); + Packet B = rhs.template loadPacket(k, j2 + PacketSize); + pstoreu(blockB + count, cj.pconj(A)); + pstoreu(blockB + count + PacketSize, cj.pconj(B)); + } else { + // const Scalar* b0 = &rhs.data()[k*rhs.stride() + j2]; + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count + 0] = cj(dm0(0)); + blockB[count + 1] = cj(dm0(1)); + blockB[count + 2] = cj(dm0(2)); + blockB[count + 3] = cj(dm0(3)); + blockB[count + 4] = cj(dm0(4)); + blockB[count + 5] = cj(dm0(5)); + blockB[count + 6] = cj(dm0(6)); + blockB[count + 7] = cj(dm0(7)); + } + count += 8; + } + // skip what we have after + if (PanelMode) count += 8 * (stride - offset - depth); + } + } + + if (nr >= 4) { + for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) { + // skip what we have before + if (PanelMode) count += 4 * offset; + for (Index k = 0; k < depth; k++) { + if (PacketSize == 4) { + Packet A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize == 4) { + HalfPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize == 4) { + QuarterPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB + count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count + 0] = cj(dm0(0)); + blockB[count + 1] = cj(dm0(1)); + blockB[count + 2] = cj(dm0(2)); + blockB[count + 3] = cj(dm0(3)); + count += 4; + } + } + // skip what we have after + if (PanelMode) count += 4 * (stride - offset - depth); + } + } + // copy the remaining columns one at a time (nr==1) + for (Index j2 = packet_cols4; j2 < cols; ++j2) { + if (PanelMode) count += offset; + for (Index k = 0; k < depth; k++) { + blockB[count] = cj(rhs(k, j2)); + count += 1; + } + if (PanelMode) count += stride - offset - depth; + } + } +}; + +template +struct gebp_kernel { + EIGEN_ALWAYS_INLINE + void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, + Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0, + Index offsetB = 0); +}; + +template +EIGEN_ALWAYS_INLINE void gebp_kernel::operator()( + const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols, + Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { + if (res.incr() == 1) { + if (alpha == 1) { + gemm_kern_avx512(rows, cols, depth, &alpha, blockA, blockB, + (Scalar *)res.data(), res.stride(), res.incr(), strideA, + strideB, offsetA, offsetB); + } else { + 
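+ // General-alpha path: the kernel applies the alpha scaling while updating C.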
gemm_kern_avx512(rows, cols, depth, &alpha, blockA, blockB, + (Scalar *)res.data(), res.stride(), res.incr(), strideA, + strideB, offsetA, offsetB); + } + } else { + if (alpha == 1) { + gemm_kern_avx512(rows, cols, depth, &alpha, blockA, blockB, + (Scalar *)res.data(), res.stride(), res.incr(), strideA, + strideB, offsetA, offsetB); + } else { + gemm_kern_avx512(rows, cols, depth, &alpha, blockA, blockB, + (Scalar *)res.data(), res.stride(), res.incr(), strideA, + strideB, offsetA, offsetB); + } + } +} +#endif // EIGEN_USE_AVX512_GEMM_KERNELS + +} // namespace internal +} // namespace Eigen + +#undef SECOND_FETCH + +#endif // EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h index 6fd726d..af47a85 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -10,39 +10,40 @@ #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_HAS_AVX512_MATH -#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ const Packet16f p16f_##NAME = preinterpret(pset1(X)) -#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ const Packet8d p8d_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) -#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \ const Packet16bf p16bf_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \ +#define EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \ const Packet16bf p16bf_##NAME = preinterpret(pset1(X)) template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f plog(const Packet16f& _x) { return plog_float(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d plog(const Packet8d& _x) { return plog_double(_x); } @@ -51,13 +52,13 @@ F16_PACKET_FUNCTION(Packet16f, Packet16h, plog) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog) template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f plog2(const Packet16f& _x) { return plog2_float(_x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d plog2(const Packet8d& _x) { return plog2_double(_x); } @@ -69,23 +70,23 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2) // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 
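// For example, x = 1.0f gives m = floor(1/log(2) + 1/2) = 1 and r = 1 - log(2) ~ 0.307,
// so exp(1) is assembled as 2^1 * exp(0.307) ~ 2.718. A plain scalar sketch of the same
// reduction (illustrative only, using <cmath>; not part of the vectorized path):
//   float m = std::floor(x * 1.44269504088896341f + 0.5f);  // x/log(2) + 1/2
//   float r = x - m * 0.6931471805599453f;                  // x - m*log(2)
//   float e = std::ldexp(std::exp(r), static_cast<int>(m)); // 2^m * exp(r)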
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 pexp<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
+  EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
+  EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
+  EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
 
-  _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
+  EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
+  EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
 
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
 
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
+  EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
 
   // Clamp x.
   Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
@@ -96,7 +97,7 @@ pexp<Packet16f>(const Packet16f& _x) {
 
   // Get r = x - m*ln(2). Note that we can do this without losing more than one
   // ulp precision due to the FMA instruction.
-  _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
+  EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
   Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
   Packet16f r2 = pmul(r, r);
   Packet16f r3 = pmul(r2, r);
@@ -120,7 +121,7 @@ pexp<Packet16f>(const Packet16f& _x) {
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 pexp<Packet8d>(const Packet8d& _x) {
   return pexp_double(_x);
 }
@@ -154,49 +155,18 @@ EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exp
   return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 
-// Functions for sqrt.
-// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
-// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. The main advantage of this approach is not just speed, but
-// also the fact that it can be inlined and pipelined with other computations,
-// further reducing its effective latency.
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
-  __mmask16 denormal_mask = _mm512_kand(
-      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
-                         _CMP_LT_OQ),
-      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
-
-  Packet16f x = _mm512_rsqrt14_ps(_x);
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
-
-  // Flush results for denormals to zero.
-  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
+  return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
-  __mmask16 denormal_mask = _mm512_kand(
-      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
-                         _CMP_LT_OQ),
-      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
-
-  Packet8d x = _mm512_rsqrt14_pd(_x);
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
-
-  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
-
-  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
+  // Double requires 2 Newton-Raphson steps for convergence.
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
 }
 #else
 template <>
@@ -223,40 +193,9 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
 #elif EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 prsqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
-
-  // Identity infinite, negative and denormal arguments.
-  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
-  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
-  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
-
-  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
-  // for denormals for consistency with AVX and SSE implementations.
-  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
-
-  // Do a single step of Newton-Raphson iteration to improve the approximation.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
-
-  // Select the result of the Newton-Raphson step for positive finite arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
-  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
-}
-#else
-
-template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
-  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
+  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
 }
 #endif
@@ -266,51 +205,28 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
 
 // prsqrt for double.
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 prsqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
-
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
-
-  // Identity infinite, negative and denormal arguments.
-  __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
-  __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
-  __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
-
-  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
-  // for denormals for consistency with AVX and SSE implementations.
-#if defined(EIGEN_VECTORIZE_AVX512ER)
-  Packet8d y_approx = _mm512_rsqrt28_pd(_x);
-#else
-  Packet8d y_approx = _mm512_rsqrt14_pd(_x);
-#endif
-  // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
-  // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
-  // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
-  // of correct digits for each step.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
-#if !defined(EIGEN_VECTORIZE_AVX512ER)
-  y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
-#endif
-  // Select the result of the Newton-Raphson step for positive finite arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
-  return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
+  #ifdef EIGEN_VECTORIZE_AVX512ER
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+  #else
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+  #endif
 }
+
+template<> EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return _mm512_rcp28_ps(a);
 #else
-template <>
-EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
-  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
+  return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
+#endif
 }
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
 #endif
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet16f plog1p<Packet16f>(const Packet16f& _x) {
   return generic_plog1p(_x);
 }
@@ -318,7 +234,7 @@ Packet16f plog1p<Packet16f>(const Packet16f& _x) {
 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
   return generic_expm1(_x);
 }
@@ -326,23 +242,47 @@ Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 
-#endif
+#endif // EIGEN_HAS_AVX512_MATH
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 psin<Packet16f>(const Packet16f& _x) {
   return psin_float(_x);
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
 pcos<Packet16f>(const Packet16f& _x) {
   return pcos_float(_x);
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
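All of the rewritten kernels above now funnel through the shared generic_sqrt_newton_step / generic_rsqrt_newton_step / generic_reciprocal_newton_step helpers; what varies is only the hardware seed (rsqrt14 with roughly 2^-14 relative error, or rsqrt28 under AVX512ER with roughly 2^-28) and the number of refinement steps. The underlying iterations in scalar form, as a sketch of the math rather than of Eigen's helpers:

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for y ~ 1/sqrt(x):
//   y_{n+1} = y_n * (1.5 - (0.5 * x) * y_n * y_n)
// The inner term is grouped as ((0.5*x)*y)*y because forming y*y first
// could over- or underflow, as the deleted comments explained.
double rsqrt_step(double x, double y) {
  return y * std::fma((-0.5 * x) * y, y, 1.5);
}

// One step for y ~ 1/x:  y_{n+1} = y_n * (2 - x * y_n).
double recip_step(double x, double y) { return y * (2.0 - x * y); }

int main() {
  // Each step roughly doubles the correct bits, so a 2^-14 seed needs one
  // step for float and two for double; a 2^-28 seed needs at most one.
  double x = 2.0, y = 0.7;  // crude seed for 1/sqrt(2)
  for (int i = 0; i < 3; ++i) y = rsqrt_step(x, y);
  std::printf("rsqrt(2) ~ %.17g, sqrt(2) ~ %.17g\n", y, x * y);  // sqrt(x) = x * rsqrt(x)
  return 0;
}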
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f +pacos(const Packet16f& _x) { + return pacos_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f +pasin(const Packet16f& _x) { + return pasin_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f +patan(const Packet16f& _x) { + return patan_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d +patan(const Packet8d& _x) { + return patan_double(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f ptanh(const Packet16f& _x) { return internal::generic_fast_tanh_float(_x); } diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h index 34d49ab..159ae3e 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_AVX512_H #define EIGEN_PACKET_MATH_AVX512_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -28,10 +30,19 @@ namespace internal { #endif #endif +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; +#ifndef EIGEN_VECTORIZE_AVX512FP16 typedef eigen_packet_wrapper<__m256i, 1> Packet16h; +#endif typedef eigen_packet_wrapper<__m256i, 2> Packet16bf; template <> @@ -47,6 +58,7 @@ struct is_arithmetic<__m512d> { enum { value = true }; }; +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> struct is_arithmetic { enum { value = true }; }; template <> @@ -72,12 +84,14 @@ struct packet_traits : default_packet_traits { HasMax = 1, HasConj = 1, HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = EIGEN_HAS_AVX512_MATH, + HasNdtri = EIGEN_HAS_AVX512_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, @@ -86,11 +100,10 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, - HasBessel = 1, - HasNdtri = 1 + HasRint = 1 }; }; +#endif template<> struct packet_traits : default_packet_traits { @@ -109,7 +122,10 @@ template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) + HasACos = 1, + HasASin = 1, + HasATan = 1, +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -118,6 +134,7 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, + HasReciprocal = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, #endif @@ -138,12 +155,13 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif 
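With pacos, pasin and patan wired up above (and the matching HasACos/HasASin/HasATan flags in packet_traits), the corresponding coefficient-wise expressions vectorize sixteen floats or eight doubles at a time. A usage sketch (assumes an AVX-512 build, e.g. -mavx512f; without it the same code simply runs on narrower packets):

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(16, -4.0f, 4.0f);
  // One Packet16f covers all 16 coefficients, dispatching to patan_float above.
  std::cout << x.atan().transpose() << "\n";
  return 0;
}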
+ HasATan = 1, HasCmp = 1, HasDiv = 1, HasRound = 1, @@ -153,17 +171,18 @@ template<> struct packet_traits : default_packet_traits }; }; -/* TODO Implement AVX512 for integers -template<> struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet16i type; + typedef Packet8i half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=8 + HasCmp = 1, + HasDiv = 1, + size=16 }; }; -*/ template <> struct unpacket_traits { @@ -171,27 +190,30 @@ struct unpacket_traits { typedef Packet8f half; typedef Packet16i integer_packet; typedef uint16_t mask_t; - enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true }; + enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef uint8_t mask_t; + enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; + enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false }; }; +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> struct unpacket_traits { typedef Eigen::half type; typedef Packet8h half; enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; +#endif template <> EIGEN_STRONG_INLINE Packet16f pset1(const float& from) { @@ -235,11 +257,25 @@ template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) { template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { +#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0) + // Inline asm here helps reduce some register spilling in TRSM kernels. 
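Note that the packet_traits<int> specialization above replaces what used to be a commented-out "TODO Implement AVX512 for integers" block: Packet16i becomes a genuinely vectorizable type, including comparisons and division (the latter implemented further down by splitting the 512-bit register into two Packet8i halves). From user code this is just ordinary integer array arithmetic; a small sketch, again assuming an AVX-512 build:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXi a = Eigen::ArrayXi::LinSpaced(16, 16, 31);
  Eigen::ArrayXi b = Eigen::ArrayXi::Constant(16, 3);
  // Coefficient-wise integer quotient; eligible for the new pdiv<Packet16i>.
  std::cout << (a / b).transpose() << "\n";
  return 0;
}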
+ // See note in unrolls::gemm::microKernel in TrsmKernel.h + Packet16f ret; + __asm__ ("vbroadcastss %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from)); + return ret; +#else return _mm512_broadcastss_ps(_mm_load_ps1(from)); +#endif } template <> EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { +#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0) + Packet8d ret; + __asm__ ("vbroadcastsd %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from)); + return ret; +#else return _mm512_set1_pd(*from); +#endif } template <> @@ -254,6 +290,12 @@ EIGEN_STRONG_INLINE Packet8d plset(const double& a) { return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); } +template <> +EIGEN_STRONG_INLINE Packet16i plset(const int& a) { + return _mm512_add_epi32( + _mm512_set1_epi32(a), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); +} template <> EIGEN_STRONG_INLINE Packet16f padd(const Packet16f& a, @@ -271,6 +313,21 @@ EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, return _mm512_add_epi32(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16f padd(const Packet16f& a, + const Packet16f& b, + uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + return _mm512_maskz_add_ps(mask, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, + const Packet8d& b, + uint8_t umask) { + __mmask8 mask = static_cast<__mmask8>(umask); + return _mm512_maskz_add_pd(mask, a, b); +} + template <> EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, const Packet16f& b) { @@ -289,11 +346,17 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); + const __m512i mask = _mm512_set1_epi32(0x80000000); + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); + const __m512i mask = _mm512_set1_epi64(0x8000000000000000ULL); + return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); +} +template <> +EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) { + return _mm512_sub_epi32(_mm512_set1_epi32(0), a); } template <> @@ -330,12 +393,21 @@ EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, const Packet16f& b) { return _mm512_div_ps(a, b); } + template <> EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, const Packet8d& b) { return _mm512_div_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pdiv(const Packet16i& a, + const Packet16i& b) { + Packet8i q_lo = pdiv(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b,0)); + Packet8i q_hi = pdiv(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); + return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1); +} + #ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, @@ -347,6 +419,39 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) { return _mm512_fmadd_pd(a, b, c); } + +template <> +EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, + const Packet16f& c) { + return _mm512_fmsub_ps(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, + const Packet8d& c) { + return _mm512_fmsub_pd(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const 
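The new pnegate above flips the IEEE sign bit with a single XOR instead of computing 0 - a. Besides dropping the dependency on a zero register, the XOR form is correct for signed zero: 0.0 - (+0.0) rounds to +0.0, so the subtraction-based negate loses the sign of zero, while the bit flip does not. Scalar equivalent of the trick:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Negate a float by XOR-ing the sign bit, mirroring the _mm512_xor_epi32
// based pnegate above. memcpy is the portable pre-C++20 bit cast.
float negate_via_signbit(float a) {
  std::uint32_t bits;
  std::memcpy(&bits, &a, sizeof bits);
  bits ^= 0x80000000u;  // flip the IEEE-754 sign bit
  std::memcpy(&a, &bits, sizeof a);
  return a;
}

int main() {
  std::printf("%g %g\n", negate_via_signbit(1.5f), negate_via_signbit(0.0f));  // -1.5 -0
  return 0;
}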
Packet16f& b, + const Packet16f& c) { + return _mm512_fnmadd_ps(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, + const Packet8d& c) { + return _mm512_fnmadd_pd(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, + const Packet16f& c) { + return _mm512_fnmsub_ps(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, + const Packet8d& c) { + return _mm512_fnmsub_pd(a, b, c); +} #endif template <> @@ -379,6 +484,11 @@ EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, // Arguments are reversed to match NaN propagation behavior of std::min. return _mm512_min_pd(b, a); } +template <> +EIGEN_STRONG_INLINE Packet16i pmin(const Packet16i& a, + const Packet16i& b) { + return _mm512_min_epi32(b, a); +} template <> EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, @@ -392,6 +502,11 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, // Arguments are reversed to match NaN propagation behavior of std::max. return _mm512_max_pd(b, a); } +template <> +EIGEN_STRONG_INLINE Packet16i pmax(const Packet16i& a, + const Packet16i& b) { + return _mm512_max_epi32(b, a); +} // Add specializations for min/max with prescribed NaN progation. template<> @@ -493,10 +608,17 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, cons } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); +} +template<> EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) { + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); +} +template<> EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) { + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT); return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } - template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { @@ -686,7 +808,7 @@ EIGEN_STRONG_INLINE Packet8d pload(const double* from) { template <> EIGEN_STRONG_INLINE Packet16i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); + reinterpret_cast(from)); } template <> @@ -708,6 +830,11 @@ EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umas __mmask16 mask = static_cast<__mmask16>(umask); EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); } +template <> +EIGEN_STRONG_INLINE Packet8d ploadu(const double* from, uint8_t umask) { + __mmask8 mask = static_cast<__mmask8>(umask); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_pd(mask, from); +} // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} @@ -746,6 +873,16 @@ EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { } #endif +// Loads 8 integers from memory and returns the packet +// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} +template <> +EIGEN_STRONG_INLINE Packet16i ploaddup(const int* from) { + __m256i low_half = _mm256_loadu_si256(reinterpret_cast(from)); + __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); + __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 
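The fused pmsub/pnmadd/pnmsub specializations added here round out the FMA family next to the existing pmadd. Per lane they differ only in the signs applied to the product and the addend; scalar semantics matching the _mm512_f{n}m{add,sub} intrinsics used above:

#include <cmath>

// Per-lane meaning of the four fused packet ops defined above:
double madd (double a, double b, double c) { return std::fma( a, b,  c); }  // pmadd:   a*b + c
double msub (double a, double b, double c) { return std::fma( a, b, -c); }  // pmsub:   a*b - c
double nmadd(double a, double b, double c) { return std::fma(-a, b,  c); }  // pnmadd: -a*b + c
double nmsub(double a, double b, double c) { return std::fma(-a, b, -c); }  // pnmsub: -a*b - c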
0, 0)); + return _mm512_castps_si512(pairs); +} + // Loads 4 floats from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> @@ -766,6 +903,15 @@ EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { return _mm512_insertf64x4(tmp, lane1, 1); } +// Loads 4 integers from memory and returns the packet +// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} +template <> +EIGEN_STRONG_INLINE Packet16i ploadquad(const int* from) { + Packet16i tmp = _mm512_castsi128_si512(ploadu(from)); + const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); + return _mm512_permutexvar_epi32(scatter_mask, tmp); +} + template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet16f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from); @@ -798,6 +944,40 @@ EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet16f& from, uint16 __mmask16 mask = static_cast<__mmask16>(umask); EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from); } +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet8d& from, uint8_t umask) { + __mmask8 mask = static_cast<__mmask8>(umask); + EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_pd(to, mask, from); +} + +template +EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, + Index stride, typename unpacket_traits::mask_t umask); +template <> +EIGEN_DEVICE_FUNC inline Packet16f pgather(const Packet16f& src, + const float* from, + Index stride, + uint16_t umask) { + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + __mmask16 mask = static_cast<__mmask16>(umask); + + return _mm512_mask_i32gather_ps(src, mask, indices, from, 4); +} +template <> +EIGEN_DEVICE_FUNC inline Packet8d pgather(const Packet8d& src, + const double* from, + Index stride, + uint8_t umask) { + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); + Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); + __mmask8 mask = static_cast<__mmask8>(umask); + + return _mm512_mask_i32gather_pd(src, mask, indices, from, 8); +} template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, @@ -818,6 +998,42 @@ EIGEN_DEVICE_FUNC inline Packet8d pgather(const double* from, return _mm512_i32gather_pd(indices, from, 8); } +template <> +EIGEN_DEVICE_FUNC inline Packet16i pgather(const int* from, + Index stride) { + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + return _mm512_i32gather_epi32(indices, from, 4); +} + +template +EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, + Index stride, typename unpacket_traits::mask_t umask); +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, + const Packet16f& from, + Index stride, + uint16_t umask) { + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + __mmask16 mask = 
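The masked pgather and pscatter overloads in this region all follow one convention: lane i addresses element i*stride, a set bit in umask means the lane participates, masked-off gather lanes keep the value of the src argument (that is what the first operand of _mm512_mask_i32gather_ps is for), and masked-off scatter lanes skip the store entirely. A scalar reference of both semantics (hypothetical test helpers):

#include <cstddef>

// Per-lane semantics of the masked, strided gather/scatter above.
template <int N, typename T>
void gather_ref(T* dst, const T* src, const T* from, std::ptrdiff_t stride, unsigned mask) {
  for (int i = 0; i < N; ++i)
    dst[i] = ((mask >> i) & 1u) ? from[i * stride] : src[i];  // cleared bit keeps the src lane
}

template <int N, typename T>
void scatter_ref(T* to, const T* vals, std::ptrdiff_t stride, unsigned mask) {
  for (int i = 0; i < N; ++i)
    if ((mask >> i) & 1u) to[i * stride] = vals[i];           // cleared bit skips the store
}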
static_cast<__mmask16>(umask); + _mm512_mask_i32scatter_ps(to, mask, indices, from, 4); +} +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, + const Packet8d& from, + Index stride, + uint8_t umask) { + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); + Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); + __mmask8 mask = static_cast<__mmask8>(umask); + _mm512_mask_i32scatter_pd(to, mask, indices, from, 8); +} template <> EIGEN_DEVICE_FUNC inline void pscatter(float* to, @@ -838,6 +1054,16 @@ EIGEN_DEVICE_FUNC inline void pscatter(double* to, Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); } +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int* to, + const Packet16i& from, + Index stride) { + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + _mm512_i32scatter_epi32(to, indices, from, 4); +} template <> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { @@ -882,6 +1108,11 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a); } +template<> EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) +{ + return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a); +} + template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { // _mm512_abs_ps intrinsic not found, so hack around it @@ -893,6 +1124,15 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff))); } +template<> EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) +{ + return _mm512_abs_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) { return _mm256_srai_epi16(a, 15); } +template<> EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) { return _mm256_srai_epi16(a, 15); } +template<> EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) { return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31)); } +template<> EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63)); } template<> EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& exponent){ @@ -901,7 +1141,7 @@ EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& e // Extract exponent without existence of Packet8l. template<> -EIGEN_STRONG_INLINE +EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) { const Packet8d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); #ifdef EIGEN_VECTORIZE_AVX512DQ @@ -924,11 +1164,11 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons // Clamp exponent to [-2099, 2099] const Packet8d max_exponent = pset1(2099.0); const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); - + // Split 2^e into four factors and multiply. 
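The "four factors" comment closing this chunk is worth unpacking. For double, the clamped exponent e may reach +/-2099 while a single finite power of two only spans 2^-1074 to 2^1023, so pldexp applies 2^e as three multiplications by 2^b with b = floor(e/4), followed by one multiplication by 2^(e-3b); every factor stays individually representable. A scalar model of the same sequencing (std::ldexp stands in for the biased-exponent bit construction the kernel performs with AVX2 integer ops):

#include <cmath>
#include <cstdio>

double ldexp_model(double a, int e) {
  e = e < -2099 ? -2099 : (e > 2099 ? 2099 : e);  // clamp as the kernel does
  int b = e >> 2;  // floor(e/4); arithmetic shift assumed, like parithmetic_shift_right<2>
  double c = std::ldexp(1.0, b);                  // 2^b, |b| <= 524, always finite
  double out = ((a * c) * c) * c;                 // a * 2^(3b)
  return out * std::ldexp(1.0, e - 3 * b);        // * 2^(e - 3b)
}

int main() {
  std::printf("%g\n", ldexp_model(1.0, -1074));   // reaches the smallest subnormal stepwise
  return 0;
}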
const Packet8i bias = pset1(1023); Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) - + // 2^b const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); @@ -936,7 +1176,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) - + // 2^(e - 3b) b = psub(psub(psub(e, b), b), b); // e - 3b hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); @@ -952,6 +1192,11 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) + +// AVX512F does not define _mm512_extracti32x8_epi32 to extract _m256i from _m512i +#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \ + __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \ + __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1) #else #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ @@ -959,12 +1204,23 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons _mm512_extractf32x4_ps(INPUT, 1), 1); \ __m256 OUTPUT##_1 = _mm256_insertf128_ps( \ _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \ - _mm512_extractf32x4_ps(INPUT, 3), 1); + _mm512_extractf32x4_ps(INPUT, 3), 1) + +#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \ + __m256i OUTPUT##_0 = _mm256_insertf128_si256( \ + _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \ + _mm512_extracti32x4_epi32(INPUT, 1), 1); \ + __m256i OUTPUT##_1 = _mm256_insertf128_si256( \ + _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \ + _mm512_extracti32x4_epi32(INPUT, 3), 1) #endif #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); + +#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ OUTPUT = _mm512_undefined_ps(); \ @@ -972,6 +1228,13 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); + +#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_undefined_epi32(); \ + OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \ + OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \ + OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2); \ + OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3); #endif template <> @@ -1000,6 +1263,24 @@ EIGEN_STRONG_INLINE double predux(const Packet8d& a) { __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); } +template <> +EIGEN_STRONG_INLINE int predux(const Packet16i& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256i lane0 = _mm512_extracti32x8_epi32(a, 0); + __m256i lane1 = 
_mm512_extracti32x8_epi32(a, 1); + Packet8i x = _mm256_add_epi32(lane0, lane1); + return predux(x); +#else + __m128i lane0 = _mm512_extracti32x4_epi32(a, 0); + __m128i lane1 = _mm512_extracti32x4_epi32(a, 1); + __m128i lane2 = _mm512_extracti32x4_epi32(a, 2); + __m128i lane3 = _mm512_extracti32x4_epi32(a, 3); + __m128i sum = _mm_add_epi32(_mm_add_epi32(lane0, lane1), _mm_add_epi32(lane2, lane3)); + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(sum), 1))); + return _mm_cvtsi128_si32(sum); +#endif +} template <> EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { @@ -1023,6 +1304,22 @@ EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { __m256d lane1 = _mm512_extractf64x4_pd(a, 1); return _mm256_add_pd(lane0, lane1); } +template <> +EIGEN_STRONG_INLINE Packet8i predux_half_dowto4(const Packet16i& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256i lane0 = _mm512_extracti32x8_epi32(a, 0); + __m256i lane1 = _mm512_extracti32x8_epi32(a, 1); + return _mm256_add_epi32(lane0, lane1); +#else + __m128i lane0 = _mm512_extracti32x4_epi32(a, 0); + __m128i lane1 = _mm512_extracti32x4_epi32(a, 1); + __m128i lane2 = _mm512_extracti32x4_epi32(a, 2); + __m128i lane3 = _mm512_extracti32x4_epi32(a, 3); + __m128i sum0 = _mm_add_epi32(lane0, lane2); + __m128i sum1 = _mm_add_epi32(lane1, lane3); + return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1); +#endif +} template <> EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { @@ -1099,7 +1396,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) return !_mm512_kortestz(tmp,tmp); } - +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x) +{ + __mmask16 tmp = _mm512_test_epi32_mask(x,x); + return !_mm512_kortestz(tmp,tmp); +} #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); @@ -1219,6 +1520,44 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \ INPUT[2 * INDEX + STRIDE]); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0],kernel.packet[1]); + __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0],kernel.packet[1]); + __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2],kernel.packet[3]); + __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2],kernel.packet[3]); + __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4],kernel.packet[5]); + __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4],kernel.packet[5]); + __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6],kernel.packet[7]); + __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6],kernel.packet[7]); + + kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2))); + kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2))); + kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3))); + kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3))); + kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6))); + kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6))); + kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7))); + kernel.packet[7] = 
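The new Packet16i reductions in this stretch mirror the existing float ones: predux is a full 16-lane horizontal sum (one 256-bit fold plus the Packet8i predux when AVX512DQ provides the 256-bit extract, otherwise four 128-bit lanes), and predux_half_dowto4 folds the two 256-bit halves lane by lane. Scalar reference semantics, as a sketch only:

// What the intrinsic sequences above compute, lane for lane.
int predux_ref(const int v[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) sum += v[i];  // full horizontal sum
  return sum;
}

void predux_half_ref(const int v[16], int out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = v[i] + v[i + 8];  // fold the upper half onto the lower
}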
_mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7))); + + T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44); + T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee); + T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44); + T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee); + T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44); + T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee); + T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44); + T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee); + + kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88); + kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd); + kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88); + kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd); + kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88); + kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd); + kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88); + kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); @@ -1295,68 +1634,216 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]); - __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]); - __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]); - __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]); - __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]); - __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]); - __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]); - __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]); + __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0],kernel.packet[1]); + __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0],kernel.packet[1]); + __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2],kernel.packet[3]); + __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2],kernel.packet[3]); + __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4],kernel.packet[5]); + __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4],kernel.packet[5]); + __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6],kernel.packet[7]); + __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6],kernel.packet[7]); - PacketBlock tmp; + kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E); + kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]); + kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E); + kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2); + kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E); + kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]); + kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E); + kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3); + kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E); + kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]); + kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E); + kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6); + kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E); + kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]); + kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E); + kernel.packet[7] = 
_mm512_mask_blend_pd(0xCC, kernel.packet[7], T7); - tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x20); - tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x20); - tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), - _mm512_extractf64x4_pd(T2, 0), 0x31); - tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), - _mm512_extractf64x4_pd(T3, 0), 0x31); + T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E); + T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0); + T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E); + T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]); + T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E); + T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1); + T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E); + T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]); + T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E); + T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2); + T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E); + T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]); + T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E); + T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3); + T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E); + T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]); - tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x20); - tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x20); - tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), - _mm512_extractf64x4_pd(T2, 1), 0x31); - tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), - _mm512_extractf64x4_pd(T3, 1), 0x31); - - tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), - _mm512_extractf64x4_pd(T6, 0), 0x20); - tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), - _mm512_extractf64x4_pd(T7, 0), 0x20); - tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), - _mm512_extractf64x4_pd(T6, 0), 0x31); - tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), - _mm512_extractf64x4_pd(T7, 0), 0x31); - - tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), - _mm512_extractf64x4_pd(T6, 1), 0x20); - tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), - _mm512_extractf64x4_pd(T7, 1), 0x20); - tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), - _mm512_extractf64x4_pd(T6, 1), 0x31); - tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), - _mm512_extractf64x4_pd(T7, 1), 0x31); - - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8); - - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8); - PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8); + kernel.packet[0] = T0; kernel.packet[1] = T1; + kernel.packet[2] = T2; kernel.packet[3] = T3; + kernel.packet[4] = T4; kernel.packet[5] = T5; + kernel.packet[6] = T6; kernel.packet[7] = T7; } + +#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) 
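Both ptranspose rewrites in this region (the 8x16 float kernel above, and the 8x8 double kernel just completed) now stay entirely inside 512-bit registers using unpacks, cross-lane shuffles and mask blends, where the deleted code round-tripped through 256-bit extracts and re-inserts. The observable behaviour is unchanged: each kernel is a plain in-register matrix transpose, so a scalar oracle for spot-checking is a single loop (hypothetical test helper):

// Reference: element (i, j) of the packet block swaps with element (j, i).
template <int N, typename T>
void transpose_ref(T m[N][N]) {
  for (int i = 0; i < N; ++i)
    for (int j = i + 1; j < N; ++j) {
      T t = m[i][j];
      m[i][j] = m[j][i];
      m[j][i] = t;
    }
}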
\ + EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); + +#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \ + EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], \ + INPUT[2 * INDEX + STRIDE]); + +#define SHUFFLE_EPI32(A, B, M) \ + _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M)) + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + __m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]); + __m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]); + __m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]); + __m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]); + __m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]); + __m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]); + __m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]); + __m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]); + __m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]); + __m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]); + __m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]); + __m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]); + __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2)); + + EIGEN_EXTRACT_8i_FROM_16i(S0, S0); + EIGEN_EXTRACT_8i_FROM_16i(S1, S1); + EIGEN_EXTRACT_8i_FROM_16i(S2, S2); + EIGEN_EXTRACT_8i_FROM_16i(S3, S3); + EIGEN_EXTRACT_8i_FROM_16i(S4, S4); + EIGEN_EXTRACT_8i_FROM_16i(S5, S5); + EIGEN_EXTRACT_8i_FROM_16i(S6, S6); + EIGEN_EXTRACT_8i_FROM_16i(S7, S7); + EIGEN_EXTRACT_8i_FROM_16i(S8, S8); + EIGEN_EXTRACT_8i_FROM_16i(S9, S9); + EIGEN_EXTRACT_8i_FROM_16i(S10, S10); + EIGEN_EXTRACT_8i_FROM_16i(S11, S11); + EIGEN_EXTRACT_8i_FROM_16i(S12, S12); + EIGEN_EXTRACT_8i_FROM_16i(S13, S13); + EIGEN_EXTRACT_8i_FROM_16i(S14, S14); + EIGEN_EXTRACT_8i_FROM_16i(S15, S15); + + PacketBlock tmp; + + tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20); + tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20); + tmp.packet[4] 
= _mm256_permute2f128_si256(S0_0, S4_0, 0x31); + tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31); + tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31); + tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31); + + tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20); + tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20); + tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20); + tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20); + tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31); + tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31); + tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31); + tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31); + + // Second set of _m256 outputs + tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20); + tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20); + tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20); + tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20); + tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31); + tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31); + tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31); + tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31); + + tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20); + tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20); + tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20); + tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20); + tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31); + tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31); + tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31); + tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31); + + // Pack them into the output + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16); + + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16); + + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16); + + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16); + PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16); +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + + __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + + EIGEN_EXTRACT_8i_FROM_16i(S0, S0); + EIGEN_EXTRACT_8i_FROM_16i(S1, S1); + EIGEN_EXTRACT_8i_FROM_16i(S2, S2); + EIGEN_EXTRACT_8i_FROM_16i(S3, S3); + + PacketBlock tmp; + + tmp.packet[0] = 
_mm256_permute2f128_si256(S0_0, S1_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31); + tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31); + + tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20); + tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20); + tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31); + tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31); + + PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1); + PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1); + PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1); + PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1); +} + template <> EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, const Packet16f& /*thenPacket*/, const Packet16f& /*elsePacket*/) { - assert(false && "To be implemented"); + eigen_assert(false && "To be implemented"); return Packet16f(); } template <> @@ -1426,64 +1913,15 @@ ploadquad(const Eigen::half* from) { } EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif } EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif } template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { - return ptrue(Packet8i(a)); + return Packet16h(ptrue(Packet8i(a))); } template <> @@ -1512,16 +1950,16 @@ EIGEN_STRONG_INLINE Packet16h plset(const half& a) { template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. 
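The comment above is the crux of the next run of one-line fixes. Packet16h and Packet16bf are both eigen_packet_wrapper types over __m256i, distinguished only by a tag parameter; por(Packet8i(a), Packet8i(b)) therefore returns a Packet8i, and letting it convert back implicitly erases which half-precision type was meant. Wrapping the result explicitly, as the + lines below do, keeps the two types distinct at overload resolution. A stripped-down model of why the tag matters (hypothetical wrapper, much simpler than Eigen's; compile with AVX2 enabled, e.g. -mavx2):

#include <immintrin.h>

// Two distinct wrapper types over the same raw register: an overload taking
// HalfPack cannot be fed a Bf16Pack even though both hold a __m256i.
template <int Tag>
struct wrapper256 {
  __m256i v;
  explicit wrapper256(__m256i raw) : v(raw) {}
};
using HalfPack = wrapper256<1>;
using Bf16Pack = wrapper256<2>;

HalfPack por(const HalfPack& a, const HalfPack& b) {
  return HalfPack(_mm256_or_si256(a.v, b.v));  // construct the result type explicitly
}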
- return por(Packet8i(a),Packet8i(b)); + return Packet16h(por(Packet8i(a),Packet8i(b))); } template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - return pxor(Packet8i(a),Packet8i(b)); + return Packet16h(pxor(Packet8i(a),Packet8i(b))); } template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - return pand(Packet8i(a),Packet8i(b)); + return Packet16h(pand(Packet8i(a),Packet8i(b))); } template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - return pandnot(Packet8i(a),Packet8i(b)); + return Packet16h(pandnot(Packet8i(a),Packet8i(b))); } template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { @@ -1569,6 +2007,7 @@ template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { return _mm256_xor_si256(a, sign_mask); } +#ifndef EIGEN_VECTORIZE_AVX512FP16 template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -1602,6 +2041,8 @@ template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { return half(predux(from_float)); } +#endif + template <> EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { Packet8h lane0 = _mm256_extractf128_si256(a, 0); @@ -1852,7 +2293,7 @@ struct packet_traits : default_packet_traits { HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, // Currently fails test with bad accuracy. HasLog1p = 1, @@ -1915,7 +2356,6 @@ EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, template<> EIGEN_STRONG_INLINE Packet16bf ploaddup(const bfloat16* from) { - Packet16bf r; unsigned short a = from[0].value; unsigned short b = from[1].value; unsigned short c = from[2].value; @@ -1929,7 +2369,6 @@ ploaddup(const bfloat16* from) { template<> EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) { - Packet16bf r; unsigned short a = from[0].value; unsigned short b = from[1].value; unsigned short c = from[2].value; @@ -1947,7 +2386,7 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) // Since GCC 10.1 supports avx512bf16 and C style explicit cast - // (C++ static_cast is not supported yet), do converion via intrinsic + // (C++ static_cast is not supported yet), do conversion via intrinsic // and register path for performance. 
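For context on F32ToBf16, which continues below: when the AVX512BF16 instruction is unavailable, the fallback truncates each float to its top 16 bits with round-to-nearest-even applied to the discarded half. That rounding is compact in scalar form (a sketch of the standard bit trick, not code copied from Eigen; NaN handling omitted):

#include <cstdint>
#include <cstdio>
#include <cstring>

// float -> bfloat16 with round-to-nearest-even on the low 16 bits.
std::uint16_t f32_to_bf16_rne(float f) {
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);
  std::uint32_t lsb = (bits >> 16) & 1u;  // last bit that will be kept
  bits += 0x7fffu + lsb;                  // round up above half, ties to even
  return static_cast<std::uint16_t>(bits >> 16);
}

int main() {
  std::printf("0x%04x\n", f32_to_bf16_rne(1.0f));  // 0x3f80, i.e. bfloat16(1.0)
  return 0;
}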
r = (__m256i)(_mm512_cvtneps_pbh(a)); @@ -1978,28 +2417,28 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { template <> EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) { - return ptrue(a); + return Packet16bf(ptrue(Packet8i(a))); } template <> EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) { - return por(a, b); + return Packet16bf(por(Packet8i(a), Packet8i(b))); } template <> EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) { - return pxor(a, b); + return Packet16bf(pxor(Packet8i(a), Packet8i(b))); } template <> EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) { - return pand(a, b); + return Packet16bf(pand(Packet8i(a), Packet8i(b))); } template <> EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) { - return pandnot(a, b); + return Packet16bf(pandnot(Packet8i(a), Packet8i(b))); } template <> diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h new file mode 100644 index 0000000..13f285e --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h @@ -0,0 +1,877 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H +#define EIGEN_PACKET_MATH_FP16_AVX512_H + +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. 
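The new PacketMathFP16.h beginning here gives Eigen::half a native 512-bit packet, Packet32h over the compiler's __m512h, when EIGEN_VECTORIZE_AVX512FP16 is defined; previously, half arithmetic on AVX-512 went through Packet16h, a round trip via float. User code does not change, it just gets 32 lanes per operation. A sketch (assumes a compiler and CPU with AVX512-FP16, e.g. -mavx512fp16 on Sapphire Rapids class hardware; otherwise Eigen falls back to the earlier paths):

#include <Eigen/Core>
#include <iostream>

int main() {
  using HalfArray = Eigen::Array<Eigen::half, Eigen::Dynamic, 1>;
  HalfArray a = HalfArray::Constant(64, Eigen::half(1.5f));
  HalfArray b = HalfArray::Constant(64, Eigen::half(2.0f));
  // With Packet32h this expression needs two packets per operand, not four.
  std::cout << static_cast<float>((a * b + a).sum()) << "\n";  // 64 * 4.5 = 288
  return 0;
}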
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + +typedef __m512h Packet32h; +typedef eigen_packet_wrapper<__m256i, 1> Packet16h; +typedef eigen_packet_wrapper<__m128i, 2> Packet8h; + +template <> +struct is_arithmetic { + enum { value = true }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet32h type; + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + HasHalfPacket = 1, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + // These ones should be implemented in future + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = 0, // EIGEN_HAS_AVX512_MATH, + HasNdtri = 0, // EIGEN_HAS_AVX512_MATH, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0, // EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet16h half; + enum { + size = 32, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet8h half; + enum { + size = 16, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet8h half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +// Memory functions + +// pset1 + +template <> +EIGEN_STRONG_INLINE Packet32h pset1(const Eigen::half& from) { + return _mm512_set1_ph(static_cast<_Float16>(from)); +} + +// pset1frombits +template <> +EIGEN_STRONG_INLINE Packet32h pset1frombits(unsigned short from) { + return _mm512_castsi512_ph(_mm512_set1_epi16(from)); +} + +// pfirst + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet32h& from) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return half_impl::raw_uint16_to_half( + static_cast(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0))); +#else + Eigen::half dest[32]; + _mm512_storeu_ph(dest, from); + return dest[0]; +#endif +} + +// pload + +template <> +EIGEN_STRONG_INLINE Packet32h pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from); +} + +// ploadu + +template <> +EIGEN_STRONG_INLINE Packet32h ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from); +} + +// pstore + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet32h& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from); +} + +// pstoreu + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet32h& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from); +} + +// ploaddup +template <> +EIGEN_STRONG_INLINE Packet32h ploaddup(const Eigen::half* from) { + __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from)); + return 
_mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), + a); +} + +// ploadquad +template <> +EIGEN_STRONG_INLINE Packet32h ploadquad(const Eigen::half* from) { + __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from)); + return _mm512_permutexvar_ph( + _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0), + a); +} + +// pabs + +template <> +EIGEN_STRONG_INLINE Packet32h pabs(const Packet32h& a) { + return _mm512_abs_ph(a); +} + +// psignbit + +template <> +EIGEN_STRONG_INLINE Packet32h psignbit(const Packet32h& a) { + return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15)); +} + +// pmin + +template <> +EIGEN_STRONG_INLINE Packet32h pmin(const Packet32h& a, const Packet32h& b) { + return _mm512_min_ph(a, b); +} + +// pmax + +template <> +EIGEN_STRONG_INLINE Packet32h pmax(const Packet32h& a, const Packet32h& b) { + return _mm512_max_ph(a, b); +} + +// plset +template <> +EIGEN_STRONG_INLINE Packet32h plset(const half& a) { + return _mm512_add_ph(_mm512_set1_ph(a), + _mm512_set_ph(31.0f, 30.0f, 29.0f, 28.0f, 27.0f, 26.0f, 25.0f, 24.0f, 23.0f, 22.0f, 21.0f, 20.0f, + 19.0f, 18.0f, 17.0f, 16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, + 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +} + +// por + +template <> +EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pxor + +template <> +EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pand + +template <> +EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b))); +} + +// pandnot + +template <> +EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) { + return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a))); +} + +// pselect + +template <> +EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) { + __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_ph(mask32, a, b); +} + +// pcmp_eq + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_le + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_lt + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, 0xffffu)); +} + +// pcmp_lt_or_nan + +template <> +EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) { + __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, 
0xffffu)); +} + +// padd + +template <> +EIGEN_STRONG_INLINE Packet32h padd(const Packet32h& a, const Packet32h& b) { + return _mm512_add_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// psub + +template <> +EIGEN_STRONG_INLINE Packet32h psub(const Packet32h& a, const Packet32h& b) { + return _mm512_sub_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pmul + +template <> +EIGEN_STRONG_INLINE Packet32h pmul(const Packet32h& a, const Packet32h& b) { + return _mm512_mul_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pdiv + +template <> +EIGEN_STRONG_INLINE Packet32h pdiv(const Packet32h& a, const Packet32h& b) { + return _mm512_div_ph(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b))); +} + +// pround + +template <> +EIGEN_STRONG_INLINE Packet32h pround(const Packet32h& a) { + // Work-around for default std::round rounding mode. 
+ + // Mask for the sign bit + const Packet32h signMask = pset1frombits(static_cast(0x8000u)); + // The largest half-preicision float less than 0.5 + const Packet32h prev0dot5 = pset1frombits(static_cast(0x37FFu)); + + return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +// print + +template <> +EIGEN_STRONG_INLINE Packet32h print(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION); +} + +// pceil + +template <> +EIGEN_STRONG_INLINE Packet32h pceil(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF); +} + +// pfloor + +template <> +EIGEN_STRONG_INLINE Packet32h pfloor(const Packet32h& a) { + return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF); +} + +// predux +template <> +EIGEN_STRONG_INLINE half predux(const Packet32h& a) { + return (half)_mm512_reduce_add_ph(a); +} + +template <> +EIGEN_STRONG_INLINE half predux(const Packet16h& a) { + return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a)); +} + +template <> +EIGEN_STRONG_INLINE half predux(const Packet8h& a) { + return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a)); +} + +// predux_half_dowto4 +template <> +EIGEN_STRONG_INLINE Packet16h predux_half_dowto4(const Packet32h& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0)); + __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1)); + + return Packet16h(padd(lowHalf, highHalf)); +#else + Eigen::half data[32]; + _mm512_storeu_ph(data, a); + + __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data)); + __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16)); + + return Packet16h(padd(lowHalf, highHalf)); +#endif +} + +// predux_max + +// predux_min + +// predux_mul + +#ifdef EIGEN_VECTORIZE_FMA + +// pmadd + +template <> +EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fmadd_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pmsub + +template <> +EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fmsub_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pnmadd + +template <> +EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fnmadd_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& 
a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +// pnmsub + +template <> +EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) { + return _mm512_fnmsub_ph(a, b, c); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) { + return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) { + return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c))); +} + +#endif + +// pnegate + +template <> +EIGEN_STRONG_INLINE Packet32h pnegate(const Packet32h& a) { + return _mm512_sub_ph(_mm512_set1_ph(0.0), a); +} + +// pconj + +template <> +EIGEN_STRONG_INLINE Packet32h pconj(const Packet32h& a) { + return a; +} + +// psqrt + +template <> +EIGEN_STRONG_INLINE Packet32h psqrt(const Packet32h& a) { + return _mm512_sqrt_ph(a); +} + +// prsqrt + +template <> +EIGEN_STRONG_INLINE Packet32h prsqrt(const Packet32h& a) { + return _mm512_rsqrt_ph(a); +} + +// preciprocal + +template <> +EIGEN_STRONG_INLINE Packet32h preciprocal(const Packet32h& a) { + return _mm512_rcp_ph(a); +} + +// ptranspose + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& a) { + __m512i t[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 16; i++) { + t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); + t[2 * i + 1] = + _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1])); + } + + __m512i p[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 8; i++) { + p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]); + p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]); + p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]); + p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]); + } + + __m512i q[32]; + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 4; i++) { + q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]); + q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]); + q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]); + q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]); + q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]); + q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]); + q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]); + q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]); + } + + __m512i f[32]; + +#define PACKET32H_TRANSPOSE_HELPER(X, Y) \ + do { \ + f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \ + f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \ + f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \ + f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \ + f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \ + f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \ + f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \ + f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], 
_mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \ + } while (false); + + PACKET32H_TRANSPOSE_HELPER(0, 0); + PACKET32H_TRANSPOSE_HELPER(1, 1); + PACKET32H_TRANSPOSE_HELPER(2, 2); + PACKET32H_TRANSPOSE_HELPER(3, 3); + + PACKET32H_TRANSPOSE_HELPER(1, 0); + PACKET32H_TRANSPOSE_HELPER(2, 0); + PACKET32H_TRANSPOSE_HELPER(3, 0); + PACKET32H_TRANSPOSE_HELPER(2, 1); + PACKET32H_TRANSPOSE_HELPER(3, 1); + PACKET32H_TRANSPOSE_HELPER(3, 2); + + PACKET32H_TRANSPOSE_HELPER(0, 1); + PACKET32H_TRANSPOSE_HELPER(0, 2); + PACKET32H_TRANSPOSE_HELPER(0, 3); + PACKET32H_TRANSPOSE_HELPER(1, 2); + PACKET32H_TRANSPOSE_HELPER(1, 3); + PACKET32H_TRANSPOSE_HELPER(2, 3); + +#undef PACKET32H_TRANSPOSE_HELPER + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 32; i++) { + a.packet[i] = _mm512_castsi512_ph(f[i]); + } +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& a) { + __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3; + t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); + t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1])); + t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); + t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3])); + + p0 = _mm512_unpacklo_epi32(t0, t2); + p1 = _mm512_unpackhi_epi32(t0, t2); + p2 = _mm512_unpacklo_epi32(t1, t3); + p3 = _mm512_unpackhi_epi32(t1, t3); + + a0 = p0; + a1 = p1; + a2 = p2; + a3 = p3; + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1); + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0); + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2); + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0); + + a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0); + + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2); + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1); + + a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2); + + a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3); + a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1); + + a.packet[0] = _mm512_castsi512_ph(a0); + a.packet[1] = _mm512_castsi512_ph(a1); + a.packet[2] = _mm512_castsi512_ph(a2); + a.packet[3] = _mm512_castsi512_ph(a3); +} + +// preverse + +template <> +EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) { + return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + a); +} + +// pscatter + +template <> +EIGEN_STRONG_INLINE void pscatter(half* to, const Packet32h& from, Index stride) { + EIGEN_ALIGN64 half aux[32]; + pstore(aux, from); + + EIGEN_UNROLL_LOOP + for (int i = 0; i < 32; i++) { + to[stride * i] = aux[i]; + } +} + +// pgather + +template <> +EIGEN_STRONG_INLINE Packet32h pgather(const Eigen::half* from, Index stride) { + return _mm512_castsi512_ph(_mm512_set_epi16( + from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x, + from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x, + from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, + from[16 * stride].x, from[15 * stride].x, from[14 
* stride].x, from[13 * stride].x, from[12 * stride].x, + from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x, + from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x, + from[1 * stride].x, from[0 * stride].x)); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pcos(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h psin(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog2(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h plog1p(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pexp(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pexpm1(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h ptanh(const Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h&, Packet16h&); +template <> +EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h&, const Packet16h&); + +EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) { + __m512d result = _mm512_undefined_pd(); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1); + return _mm512_castpd_ph(result); +} + +EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) { + a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0)); + b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1)); +} + +// psin +template <> +EIGEN_STRONG_INLINE Packet32h psin(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = psin(low); + Packet16h highOut = psin(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pcos +template <> +EIGEN_STRONG_INLINE Packet32h pcos(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pcos(low); + Packet16h highOut = pcos(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog +template <> +EIGEN_STRONG_INLINE Packet32h plog(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = plog(low); + Packet16h highOut = plog(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog2 +template <> +EIGEN_STRONG_INLINE Packet32h plog2(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = plog2(low); + Packet16h highOut = plog2(high); + + return combine2Packet16h(lowOut, highOut); +} + +// plog1p +template <> +EIGEN_STRONG_INLINE Packet32h plog1p(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = plog1p(low); + Packet16h highOut = plog1p(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pexp +template <> +EIGEN_STRONG_INLINE Packet32h pexp(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pexp(low); + Packet16h highOut = pexp(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pexpm1 +template <> +EIGEN_STRONG_INLINE Packet32h pexpm1(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = pexpm1(low); + Packet16h highOut = pexpm1(high); + + return combine2Packet16h(lowOut, highOut); +} + +// ptanh +template <> +EIGEN_STRONG_INLINE Packet32h 
ptanh(const Packet32h& a) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h lowOut = ptanh(low); + Packet16h highOut = ptanh(high); + + return combine2Packet16h(lowOut, highOut); +} + +// pfrexp +template <> +EIGEN_STRONG_INLINE Packet32h pfrexp(const Packet32h& a, Packet32h& exponent) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h exp1 = _mm256_undefined_si256(); + Packet16h exp2 = _mm256_undefined_si256(); + + Packet16h lowOut = pfrexp(low, exp1); + Packet16h highOut = pfrexp(high, exp2); + + exponent = combine2Packet16h(exp1, exp2); + + return combine2Packet16h(lowOut, highOut); +} + +// pldexp +template <> +EIGEN_STRONG_INLINE Packet32h pldexp(const Packet32h& a, const Packet32h& exponent) { + Packet16h low; + Packet16h high; + extract2Packet16h(a, low, high); + + Packet16h exp1; + Packet16h exp2; + extract2Packet16h(exponent, exp1, exp2); + + Packet16h lowOut = pldexp(low, exp1); + Packet16h highOut = pldexp(high, exp2); + + return combine2Packet16h(lowOut, highOut); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_FP16_AVX512_H \ No newline at end of file diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h new file mode 100644 index 0000000..edd6ef3 --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h @@ -0,0 +1,1185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2022 Intel Corporation +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H +#define EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H + +#include "../../InternalHeaderCheck.h" + +#if !defined(EIGEN_USE_AVX512_TRSM_KERNELS) +#define EIGEN_USE_AVX512_TRSM_KERNELS 1 +#endif + +#if EIGEN_USE_AVX512_TRSM_KERNELS +#if !defined(EIGEN_USE_AVX512_TRSM_R_KERNELS) +#define EIGEN_USE_AVX512_TRSM_R_KERNELS 1 +#endif +#if !defined(EIGEN_USE_AVX512_TRSM_L_KERNELS) +#define EIGEN_USE_AVX512_TRSM_L_KERNELS 1 +#endif +#else // EIGEN_USE_AVX512_TRSM_KERNELS == 0 +#define EIGEN_USE_AVX512_TRSM_R_KERNELS 0 +#define EIGEN_USE_AVX512_TRSM_L_KERNELS 0 +#endif + +// Need this for some std::min calls. +#ifdef min +#undef min +#endif + +namespace Eigen { +namespace internal { + +#define EIGEN_AVX_MAX_NUM_ACC (int64_t(24)) +#define EIGEN_AVX_MAX_NUM_ROW (int64_t(8)) // Denoted L in code. +#define EIGEN_AVX_MAX_K_UNROL (int64_t(4)) +#define EIGEN_AVX_B_LOAD_SETS (int64_t(2)) +#define EIGEN_AVX_MAX_A_BCAST (int64_t(2)) +typedef Packet16f vecFullFloat; +typedef Packet8d vecFullDouble; +typedef Packet8f vecHalfFloat; +typedef Packet4d vecHalfDouble; + +// Compile-time unrolls are implemented here. +// Note: this depends on macros and typedefs above. +#include "TrsmUnrolls.inc" + +#if (EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0) +/** + * For smaller problem sizes, and certain compilers, using the optimized kernels trsmKernelL/R directly + * is faster than the packed versions in TriangularSolverMatrix.h. + * + * The current heuristic is based on having having all arrays used in the largest gemm-update + * in triSolve fit in roughly L2Cap (percentage) of the L2 cache. These cutoffs are a bit conservative and could be + * larger for some trsm cases. 
+ * The formula: + * + * (L*M + M*N + L*N)*sizeof(Scalar) < L2Cache*L2Cap + * + * L = number of rows to solve at a time + * N = number of rhs + * M = Dimension of triangular matrix + * + */ +#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS) +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 1 +#endif + +#if EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS + +#if EIGEN_USE_AVX512_TRSM_R_KERNELS +#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS) +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 1 +#endif // !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS) +#endif + +#if EIGEN_USE_AVX512_TRSM_L_KERNELS +#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS) +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 1 +#endif +#endif // EIGEN_USE_AVX512_TRSM_L_KERNELS + +#else // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS == 0 +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0 +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0 +#endif // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS + +template +int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap) { + const int64_t U3 = 3 * packet_traits::size; + const int64_t MaxNb = 5 * U3; + int64_t Nb = std::min(MaxNb, N); + double cutoff_d = + (((L2Size * L2Cap) / (sizeof(Scalar))) - (EIGEN_AVX_MAX_NUM_ROW)*Nb) / ((EIGEN_AVX_MAX_NUM_ROW) + Nb); + int64_t cutoff_l = static_cast(cutoff_d); + return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW; +} +#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0) +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0 +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0 +#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0 +#endif + +/** + * Used by gemmKernel for the case A/B row-major and C col-major. + */ +template +static EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock &zmm, + Scalar *C_arr, int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) { + EIGEN_UNUSED_VARIABLE(remN_); + EIGEN_UNUSED_VARIABLE(remM_); + using urolls = unrolls::trans; + + constexpr int64_t U3 = urolls::PacketSize * 3; + constexpr int64_t U2 = urolls::PacketSize * 2; + constexpr int64_t U1 = urolls::PacketSize * 1; + + static_assert(unrollN == U1 || unrollN == U2 || unrollN == U3, "unrollN should be a multiple of PacketSize"); + static_assert(unrollM == EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW"); + + urolls::template transpose(zmm); + EIGEN_IF_CONSTEXPR(unrollN > U2) urolls::template transpose(zmm); + EIGEN_IF_CONSTEXPR(unrollN > U1) urolls::template transpose(zmm); + + static_assert((remN && unrollN == U1) || !remN, "When handling N remainder set unrollN=U1"); + EIGEN_IF_CONSTEXPR(!remN) { + urolls::template storeC(C_arr, LDC, zmm, remM_); + EIGEN_IF_CONSTEXPR(unrollN > U1) { + constexpr int64_t unrollN_ = std::min(unrollN - U1, U1); + urolls::template storeC(C_arr + U1 * LDC, LDC, zmm, remM_); + } + EIGEN_IF_CONSTEXPR(unrollN > U2) { + constexpr int64_t unrollN_ = std::min(unrollN - U2, U1); + urolls::template storeC(C_arr + U2 * LDC, LDC, zmm, remM_); + } + } + else { + EIGEN_IF_CONSTEXPR((std::is_same::value)) { + // Note: without "if constexpr" this section of code will also be + // parsed by the compiler so each of the storeC will still be instantiated. + // We use enable_if in aux_storeC to set it to an empty function for + // these cases. 
+ if (remN_ == 15) + urolls::template storeC<15, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 14) + urolls::template storeC<14, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 13) + urolls::template storeC<13, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 12) + urolls::template storeC<12, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 11) + urolls::template storeC<11, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 10) + urolls::template storeC<10, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 9) + urolls::template storeC<9, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 8) + urolls::template storeC<8, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 7) + urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 6) + urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 5) + urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 4) + urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 3) + urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 2) + urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 1) + urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + } + else { + if (remN_ == 7) + urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 6) + urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 5) + urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 4) + urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 3) + urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 2) + urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + else if (remN_ == 1) + urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_); + } + } +} + +/** + * GEMM like operation for trsm panel updates. + * Computes: C -= A*B + * K must be multipe of 4. + * + * Unrolls used are {1,2,4,8}x{U1,U2,U3}; + * For good performance we want K to be large with M/N relatively small, but also large enough + * to use the {8,U3} unroll block. + * + * isARowMajor: is A_arr row-major? + * isCRowMajor: is C_arr row-major? (B_arr is assumed to be row-major). + * isAdd: C += A*B or C -= A*B (used by trsm) + * handleKRem: Handle arbitrary K? This is not needed for trsm. 
+ */ +template +void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB, + int64_t LDC) { + using urolls = unrolls::gemm; + constexpr int64_t U3 = urolls::PacketSize * 3; + constexpr int64_t U2 = urolls::PacketSize * 2; + constexpr int64_t U1 = urolls::PacketSize * 1; + using vec = typename std::conditional::value, vecFullFloat, vecFullDouble>::type; + int64_t N_ = (N / U3) * U3; + int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW; + int64_t K_ = (K / EIGEN_AVX_MAX_K_UNROL) * EIGEN_AVX_MAX_K_UNROL; + int64_t j = 0; + for (; j < N_; j += U3) { + constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 3; + int64_t i = 0; + for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<3, EIGEN_AVX_MAX_NUM_ROW>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC); + } + } + if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<3, 4>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<3, 4>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<3, 4>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 4); + } + i += 4; + } + if (M - i >= 2) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<3, 2>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<3, 2>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<3, 2>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 2); + } + i += 2; + } + if (M - i > 0) { + Scalar *A_t = &A_arr[idA(i, 0, 
LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<3, 1>(zmm); + { + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<3, 1>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<3, 1>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 1); + } + } + } + } + if (N - j >= U2) { + constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 2; + int64_t i = 0; + for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j]; + EIGEN_IF_CONSTEXPR(isCRowMajor) B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<2, EIGEN_AVX_MAX_NUM_ROW>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC); + } + } + if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. 
Should be removed otherwise + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<2, 4>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<2, 4>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<2, 4>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 4); + } + i += 4; + } + if (M - i >= 2) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<2, 2>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<2, 2>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<2, 2>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 2); + } + i += 2; + } + if (M - i > 0) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<2, 1>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<2, 1>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<2, 1>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 1); + } + } + j += U2; + } + if (N - j >= U1) { + constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1; + int64_t i = 0; + for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + 
urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC); + } + } + if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 4>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 4>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<1, 4>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 4); + } + i += 4; + } + if (M - i >= 2) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 2>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 2>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<1, 2>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 2); + } + i += 2; + } + if (M - i > 0) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 1>(zmm); + { + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, + LDA, zmm); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 1>(&C_arr[i * LDC + j], LDC, zmm); + urolls::template storeC<1, 1>(&C_arr[i * LDC + j], LDC, zmm); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 1); + } + } + } + j += U1; + } + if (N - j > 0) { + constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1; + int64_t i = 0; + for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm, N - j); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template 
microKernel(B_t, A_t, LDB, LDA, zmm, N - j); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 0, N - j); + } + } + if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 4>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm, N - j); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm, N - j); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + urolls::template storeC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 4, N - j); + } + i += 4; + } + if (M - i >= 2) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 2>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm, N - j); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm, N - j); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + urolls::template storeC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 2, N - j); + } + i += 2; + } + if (M - i > 0) { + Scalar *A_t = &A_arr[idA(i, 0, LDA)]; + Scalar *B_t = &B_arr[0 * LDB + j]; + PacketBlock zmm; + urolls::template setzero<1, 1>(zmm); + for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) { + urolls::template microKernel( + B_t, A_t, LDB, LDA, zmm, N - j); + B_t += EIGEN_AVX_MAX_K_UNROL * LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL; + else A_t += EIGEN_AVX_MAX_K_UNROL * LDA; + } + EIGEN_IF_CONSTEXPR(handleKRem) { + for (int64_t k = K_; k < K; k++) { + urolls::template microKernel(B_t, A_t, LDB, LDA, zmm, + N - j); + B_t += LDB; + EIGEN_IF_CONSTEXPR(isARowMajor) A_t++; + else A_t += LDA; + } + } + EIGEN_IF_CONSTEXPR(isCRowMajor) { + urolls::template updateC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + urolls::template storeC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j); + } + else { + transStoreC(zmm, &C_arr[i + j * LDC], LDC, 1, N - j); + } + } + } +} + +/** + * Triangular solve kernel with A on left with K number of rhs. dim(A) = unrollM + * + * unrollM: dimension of A matrix (triangular matrix). unrollM should be <= EIGEN_AVX_MAX_NUM_ROW + * isFWDSolve: is forward solve? 
+ * isUnitDiag: is the diagonal of A all ones? + * The B matrix (RHS) is assumed to be row-major + */ +template +static EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) { + static_assert(unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW"); + using urolls = unrolls::trsm; + constexpr int64_t U3 = urolls::PacketSize * 3; + constexpr int64_t U2 = urolls::PacketSize * 2; + constexpr int64_t U1 = urolls::PacketSize * 1; + + PacketBlock RHSInPacket; + PacketBlock AInPacket; + + int64_t k = 0; + while (K - k >= U3) { + urolls::template loadRHS(B_arr + k, LDB, RHSInPacket); + urolls::template triSolveMicroKernel(A_arr, LDA, RHSInPacket, + AInPacket); + urolls::template storeRHS(B_arr + k, LDB, RHSInPacket); + k += U3; + } + if (K - k >= U2) { + urolls::template loadRHS(B_arr + k, LDB, RHSInPacket); + urolls::template triSolveMicroKernel(A_arr, LDA, RHSInPacket, + AInPacket); + urolls::template storeRHS(B_arr + k, LDB, RHSInPacket); + k += U2; + } + if (K - k >= U1) { + urolls::template loadRHS(B_arr + k, LDB, RHSInPacket); + urolls::template triSolveMicroKernel(A_arr, LDA, RHSInPacket, + AInPacket); + urolls::template storeRHS(B_arr + k, LDB, RHSInPacket); + k += U1; + } + if (K - k > 0) { + // Handle remaining number of RHS + urolls::template loadRHS(B_arr + k, LDB, RHSInPacket, K - k); + urolls::template triSolveMicroKernel(A_arr, LDA, RHSInPacket, + AInPacket); + urolls::template storeRHS(B_arr + k, LDB, RHSInPacket, K - k); + } +} + +/** + * Triangular solve routine with A on left and dimension of at most L with K number of rhs. This is essentially + * a wrapper for triSolveMicrokernel for M = {1,2,3,4,5,6,7,8}. + * + * isFWDSolve: is forward solve? + * isUnitDiag: is the diagonal of A all ones? + * The B matrix (RHS) is assumed to be row-major + */ +template +void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) { + // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted + // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller. + using vec = typename std::conditional::value, vecFullFloat, vecFullDouble>::type; + if (M == 8) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 7) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 6) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 5) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 4) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 3) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 2) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + else if (M == 1) + triSolveKernel(A_arr, B_arr, K, LDA, LDB); + return; +} + +/** + * This routine is used to copy B to/from a temporary array (row-major) for cases where B is column-major. 
+ * + * toTemp: true => copy to temporary array, false => copy from temporary array + * remM: true = need to handle remainder values for M (M < EIGEN_AVX_MAX_NUM_ROW) + * + */ +template +static EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_, + int64_t remM_ = 0) { + EIGEN_UNUSED_VARIABLE(remM_); + using urolls = unrolls::transB; + using vecHalf = typename std::conditional::value, vecHalfFloat, vecFullDouble>::type; + PacketBlock ymm; + constexpr int64_t U3 = urolls::PacketSize * 3; + constexpr int64_t U2 = urolls::PacketSize * 2; + constexpr int64_t U1 = urolls::PacketSize * 1; + int64_t K_ = K / U3 * U3; + int64_t k = 0; + + for (; k < K_; k += U3) { + urolls::template transB_kernel(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += U3; + } + if (K - k >= U2) { + urolls::template transB_kernel(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += U2; + k += U2; + } + if (K - k >= U1) { + urolls::template transB_kernel(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += U1; + k += U1; + } + EIGEN_IF_CONSTEXPR(U1 > 8) { + // Note: without "if constexpr" this section of code will also be + // parsed by the compiler so there is an additional check in {load/store}BBlock + // to make sure the counter is not non-negative. + if (K - k >= 8) { + urolls::template transB_kernel<8, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += 8; + k += 8; + } + } + EIGEN_IF_CONSTEXPR(U1 > 4) { + // Note: without "if constexpr" this section of code will also be + // parsed by the compiler so there is an additional check in {load/store}BBlock + // to make sure the counter is not non-negative. + if (K - k >= 4) { + urolls::template transB_kernel<4, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += 4; + k += 4; + } + } + if (K - k >= 2) { + urolls::template transB_kernel<2, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += 2; + k += 2; + } + if (K - k >= 1) { + urolls::template transB_kernel<1, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_); + B_temp += 1; + k += 1; + } +} + +#if (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC) +/** + * Reduce blocking sizes so that the size of the temporary workspace needed is less than "limit" bytes, + * - kB must be at least psize + * - numM must be at least EIGEN_AVX_MAX_NUM_ROW + */ +template +constexpr std::pair trsmBlocking(const int64_t limit) { + constexpr int64_t psize = packet_traits::size; + int64_t kB = 15 * psize; + int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW; + // If B is rowmajor, no temp workspace needed, so use default blocking sizes. + if (isBRowMajor) return {kB, numM}; + + // Very simple heuristic, prefer keeping kB as large as possible to fully use vector registers. + for (int64_t k = kB; k > psize; k -= psize) { + for (int64_t m = numM; m > EIGEN_AVX_MAX_NUM_ROW; m -= EIGEN_AVX_MAX_NUM_ROW) { + if ((((k + psize - 1) / psize + 4) * psize) * m * sizeof(Scalar) < limit) { + return {k, m}; + } + } + } + return {psize, EIGEN_AVX_MAX_NUM_ROW}; // Minimum blocking size required +} +#endif // (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC) + +/** + * Main triangular solve driver + * + * Triangular solve with A on the left. + * Scalar: Scalar precision, only float/double is supported. + * isARowMajor: is A row-major? + * isBRowMajor: is B row-major? + * isFWDSolve: is this forward solve or backward (true => forward)? 
+ * isUnitDiag: is diagonal of A unit or nonunit (true => A has unit diagonal)? + * + * M: dimension of A + * numRHS: number of right hand sides (coincides with K dimension for gemm updates) + * + * Here are the mapping between the different TRSM cases (col-major) and triSolve: + * + * LLN (left , lower, A non-transposed) :: isARowMajor=false, isBRowMajor=false, isFWDSolve=true + * LUT (left , upper, A transposed) :: isARowMajor=true, isBRowMajor=false, isFWDSolve=true + * LUN (left , upper, A non-transposed) :: isARowMajor=false, isBRowMajor=false, isFWDSolve=false + * LLT (left , lower, A transposed) :: isARowMajor=true, isBRowMajor=false, isFWDSolve=false + * RUN (right, upper, A non-transposed) :: isARowMajor=true, isBRowMajor=true, isFWDSolve=true + * RLT (right, lower, A transposed) :: isARowMajor=false, isBRowMajor=true, isFWDSolve=true + * RUT (right, upper, A transposed) :: isARowMajor=false, isBRowMajor=true, isFWDSolve=false + * RLN (right, lower, A non-transposed) :: isARowMajor=true, isBRowMajor=true, isFWDSolve=false + * + * Note: For RXX cases M,numRHS should be swapped. + * + */ +template +void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) { + constexpr int64_t psize = packet_traits::size; + /** + * The values for kB, numM were determined experimentally. + * kB: Number of RHS we process at a time. + * numM: number of rows of B we will store in a temporary array (see below.) This should be a multiple of L. + * + * kB was determined by initially setting kB = numRHS and benchmarking triSolve (TRSM-RUN case) + * performance with M=numRHS. + * It was observed that performance started to drop around M=numRHS=240. This is likely machine dependent. + * + * numM was chosen "arbitrarily". It should be relatively small so B_temp is not too large, but it should be + * large enough to allow GEMM updates to have larger "K"s (see below.) No benchmarking has been done so far to + * determine optimal values for numM. + */ +#if (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC) + /** + * If EIGEN_NO_MALLOC is requested, we try to reduce kB and numM so the maximum temp workspace required is less + * than EIGEN_STACK_ALLOCATION_LIMIT. Actual workspace size may be less, depending on the number of vectors to + * solve. + * - kB must be at least psize + * - numM must be at least EIGEN_AVX_MAX_NUM_ROW + * + * If B is row-major, the blocking sizes are not reduced (no temp workspace needed). + */ + constexpr std::pair blocking_ = trsmBlocking(EIGEN_STACK_ALLOCATION_LIMIT); + constexpr int64_t kB = blocking_.first; + constexpr int64_t numM = blocking_.second; + /** + * If the temp workspace size exceeds EIGEN_STACK_ALLOCATION_LIMIT even with the minimum blocking sizes, + * we throw an assertion. Use -DEIGEN_USE_AVX512_TRSM_L_KERNELS=0 if necessary + */ + static_assert(!(((((kB + psize - 1) / psize + 4) * psize) * numM * sizeof(Scalar) >= EIGEN_STACK_ALLOCATION_LIMIT) && + !isBRowMajor), + "Temp workspace required is too large."); +#else + constexpr int64_t kB = (3 * psize) * 5; // 5*U3 + constexpr int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW; +#endif + + int64_t sizeBTemp = 0; + Scalar *B_temp = NULL; + EIGEN_IF_CONSTEXPR(!isBRowMajor) { + /** + * If B is col-major, we copy it to a fixed-size temporary array of size at most ~numM*kB and + * transpose it to row-major. Call the solve routine, and copy+transpose it back to the original array. + * The updated row-major copy of B is reused in the GEMM updates. 
+ */ + sizeBTemp = (((std::min(kB, numRHS) + psize - 1) / psize + 4) * psize) * numM; + } + +#if !defined(EIGEN_NO_MALLOC) + EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar *)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64); +#elif (EIGEN_USE_AVX512_TRSM_L_KERNELS) && defined(EIGEN_NO_MALLOC) + // Use alloca if malloc not allowed, requested temp workspace size should be less than EIGEN_STACK_ALLOCATION_LIMIT + ei_declare_aligned_stack_constructed_variable(Scalar, B_temp_alloca, sizeBTemp, 0); + B_temp = B_temp_alloca; +#endif + + for (int64_t k = 0; k < numRHS; k += kB) { + int64_t bK = numRHS - k > kB ? kB : numRHS - k; + int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW, gemmOff = 0; + + // bK rounded up to next multiple of L=EIGEN_AVX_MAX_NUM_ROW. When B_temp is used, we solve for bkL RHS + // instead of bK RHS in triSolveKernelLxK. + int64_t bkL = ((bK + (EIGEN_AVX_MAX_NUM_ROW - 1)) / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW; + const int64_t numScalarPerCache = 64 / sizeof(Scalar); + // Leading dimension of B_temp, will be a multiple of the cache line size. + int64_t LDT = ((bkL + (numScalarPerCache - 1)) / numScalarPerCache) * numScalarPerCache; + int64_t offsetBTemp = 0; + for (int64_t i = 0; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) { + EIGEN_IF_CONSTEXPR(!isBRowMajor) { + int64_t indA_i = isFWDSolve ? i : M - 1 - i; + int64_t indB_i = isFWDSolve ? i : M - (i + EIGEN_AVX_MAX_NUM_ROW); + int64_t offB_1 = isFWDSolve ? offsetBTemp : sizeBTemp - EIGEN_AVX_MAX_NUM_ROW * LDT - offsetBTemp; + int64_t offB_2 = isFWDSolve ? offsetBTemp : sizeBTemp - LDT - offsetBTemp; + // Copy values from B to B_temp. + copyBToRowMajor(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT); + // Triangular solve with a small block of A and long horizontal blocks of B (or B_temp if B col-major) + triSolveKernelLxK( + &A_arr[idA(indA_i, indA_i, LDA)], B_temp + offB_2, EIGEN_AVX_MAX_NUM_ROW, bkL, LDA, LDT); + // Copy values from B_temp back to B. B_temp will be reused in gemm call below. + copyBToRowMajor(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT); + + offsetBTemp += EIGEN_AVX_MAX_NUM_ROW * LDT; + } + else { + int64_t ind = isFWDSolve ? i : M - 1 - i; + triSolveKernelLxK( + &A_arr[idA(ind, ind, LDA)], B_arr + k + ind * LDB, EIGEN_AVX_MAX_NUM_ROW, bK, LDA, LDB); + } + if (i + EIGEN_AVX_MAX_NUM_ROW < M_) { + /** + * For the GEMM updates, we want "K" (K=i+8 in this case) to be large as soon as possible + * to reuse the accumulators in GEMM as much as possible. So we only update 8xbK blocks of + * B as follows: + * + * A B + * __ + * |__|__ |__| + * |__|__|__ |__| + * |__|__|__|__ |__| + * |********|__| |**| + */ + EIGEN_IF_CONSTEXPR(isBRowMajor) { + int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW); + int64_t indA_j = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW); + int64_t indB_i = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW); + int64_t indB_i2 = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW); + gemmKernel( + &A_arr[idA(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB, + EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW, LDA, LDB, LDB); + } + else { + if (offsetBTemp + EIGEN_AVX_MAX_NUM_ROW * LDT > sizeBTemp) { + /** + * Similar idea as mentioned above, but here we are limited by the number of updated values of B + * that can be stored (row-major) in B_temp. 
+           *
+           * If there is not enough space to store the next batch of 8xbK of B in B_temp, we call the GEMM
+           * update and partially update the remaining old values of B which depend on the new values
+           * of B stored in B_temp. These values are then no longer needed and can be overwritten.
+           */
+          int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+          int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+          int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+          gemmKernel(
+              &A_arr[idA(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+              M - (i + EIGEN_AVX_MAX_NUM_ROW), bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
+          offsetBTemp = 0;
+          gemmOff = i + EIGEN_AVX_MAX_NUM_ROW;
+        } else {
+          /**
+           * If there is enough space in B_temp, we only update the next 8xbK values of B.
+           */
+          int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+          gemmKernel(
+              &A_arr[idA(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+              EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
+        }
+      }
+    }
+    // Handle M remainder.
+    int64_t bM = M - M_;
+    if (bM > 0) {
+      if (M_ > 0) {
+        EIGEN_IF_CONSTEXPR(isBRowMajor) {
+          int64_t indA_i = isFWDSolve ? M_ : 0;
+          int64_t indA_j = isFWDSolve ? 0 : bM;
+          int64_t indB_i = isFWDSolve ? 0 : bM;
+          int64_t indB_i2 = isFWDSolve ? M_ : 0;
+          gemmKernel(
+              &A_arr[idA(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB, bM,
+              bK, M_, LDA, LDB, LDB);
+        }
+        else {
+          int64_t indA_i = isFWDSolve ? M_ : 0;
+          int64_t indA_j = isFWDSolve ? gemmOff : bM;
+          int64_t indB_i = isFWDSolve ? M_ : 0;
+          int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+          gemmKernel(&A_arr[idA(indA_i, indA_j, LDA)],
+                     B_temp + offB_1, B_arr + indB_i + (k)*LDB, bM, bK,
+                     M_ - gemmOff, LDA, LDT, LDB);
+        }
+      }
+      EIGEN_IF_CONSTEXPR(!isBRowMajor) {
+        int64_t indA_i = isFWDSolve ? M_ : M - 1 - M_;
+        int64_t indB_i = isFWDSolve ? M_ : 0;
+        int64_t offB_1 = isFWDSolve ? 0 : (bM - 1) * bkL;
+        copyBToRowMajor(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
+        triSolveKernelLxK(&A_arr[idA(indA_i, indA_i, LDA)],
+                          B_temp + offB_1, bM, bkL, LDA, bkL);
+        copyBToRowMajor(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
+      }
+      else {
+        int64_t ind = isFWDSolve ? M_ : M - 1 - M_;
+        triSolveKernelLxK(&A_arr[idA(ind, ind, LDA)],
+                          B_arr + k + ind * LDB, bM, bK, LDA, LDB);
+      }
+    }
+  }
+
+#if !defined(EIGEN_NO_MALLOC)
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) handmade_aligned_free(B_temp);
+#endif
+}
+
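For orientation, the eight-case table in the triSolve comment above can be checked mechanically. Below is a minimal, self-contained sketch of that mapping; `TriSolveFlags`, `mapTrsmCase`, and its parameter names are illustrative only (inferred from the table rows, not part of Eigen):

```cpp
// Hypothetical helper: derive triSolve's template flags from a col-major TRSM case.
// leftSide/lowerA/transA mirror the L/R, L/U, N/T letters of the case names.
struct TriSolveFlags {
  bool isARowMajor, isBRowMajor, isFWDSolve;
};

constexpr TriSolveFlags mapTrsmCase(bool leftSide, bool lowerA, bool transA) {
  // Left-sided cases keep B col-major; viewing A transposed flips it to row-major, and the
  // solve runs forward exactly when the triangle and transpose flags differ. Right-sided
  // cases treat B as row-major (M and numRHS are swapped, per the Note above) and run
  // forward exactly when the triangle and transpose flags agree.
  return leftSide ? TriSolveFlags{transA, false, lowerA != transA}
                  : TriSolveFlags{!transA, true, lowerA == transA};
}

// Spot-check two rows of the table at compile time.
constexpr TriSolveFlags lln = mapTrsmCase(true, true, false);   // LLN
static_assert(!lln.isARowMajor && !lln.isBRowMajor && lln.isFWDSolve, "LLN row");
constexpr TriSolveFlags rlt = mapTrsmCase(false, true, true);   // RLT
static_assert(!rlt.isARowMajor && rlt.isBRowMajor && rlt.isFWDSolve, "RLT row");
```

+// Template specializations of trsmKernelL/R for float/double and inner strides of 1.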
+#if (EIGEN_USE_AVX512_TRSM_KERNELS) +#if (EIGEN_USE_AVX512_TRSM_R_KERNELS) +template +struct trsmKernelR; + +template +struct trsmKernelR { + static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr, + Index otherStride); +}; + +template +struct trsmKernelR { + static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr, + Index otherStride); +}; + +template +EIGEN_DONT_INLINE void trsmKernelR::kernel( + Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr, + Index otherStride) { + EIGEN_UNUSED_VARIABLE(otherIncr); + triSolve( + const_cast(_tri), _other, size, otherSize, triStride, otherStride); +} + +template +EIGEN_DONT_INLINE void trsmKernelR::kernel( + Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr, + Index otherStride) { + EIGEN_UNUSED_VARIABLE(otherIncr); + triSolve( + const_cast(_tri), _other, size, otherSize, triStride, otherStride); +} +#endif // (EIGEN_USE_AVX512_TRSM_R_KERNELS) + +// These trsm kernels require temporary memory allocation +#if (EIGEN_USE_AVX512_TRSM_L_KERNELS) +template +struct trsmKernelL; + +template +struct trsmKernelL { + static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr, + Index otherStride); +}; + +template +struct trsmKernelL { + static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr, + Index otherStride); +}; + +template +EIGEN_DONT_INLINE void trsmKernelL::kernel( + Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr, + Index otherStride) { + EIGEN_UNUSED_VARIABLE(otherIncr); + triSolve( + const_cast(_tri), _other, size, otherSize, triStride, otherStride); +} + +template +EIGEN_DONT_INLINE void trsmKernelL::kernel( + Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr, + Index otherStride) { + EIGEN_UNUSED_VARIABLE(otherIncr); + triSolve( + const_cast(_tri), _other, size, otherSize, triStride, otherStride); +} +#endif // EIGEN_USE_AVX512_TRSM_L_KERNELS +#endif // EIGEN_USE_AVX512_TRSM_KERNELS +} // namespace internal +} // namespace Eigen +#endif // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc new file mode 100644 index 0000000..6b09424 --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc @@ -0,0 +1,1212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2022 Intel Corporation +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H +#define EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H + +template +static EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) { + EIGEN_IF_CONSTEXPR(isARowMajor) return i * LDA + j; + else return i + j * LDA; +} + +/** + * This namespace contains various classes used to generate compile-time unrolls which are + * used throughout the trsm/gemm kernels. The unrolls are characterized as for-loops (1-D), nested + * for-loops (2-D), or triple nested for-loops (3-D). 
Unrolls are generated using template recursion.
+ *
+ * For example, the 2-D for-loop is unrolled recursively by first flattening it to a 1-D loop:
+ *
+ * for(startI = 0; startI < endI; startI++)        for(startC = 0; startC < endI*endJ; startC++)
+ *   for(startJ = 0; startJ < endJ; startJ++)  ---->  startI = (startC)/(endJ)
+ *     func(startI,startJ)                            startJ = (startC)%(endJ)
+ *                                                    func(...)
+ *
+ * The 1-D loop can be unrolled recursively by using enable_if and defining an auxiliary function
+ * with a template parameter used as a counter.
+ *
+ * template <endI, endJ, counter>
+ * std::enable_if_t<(counter <= 0)>  <---- tail case.
+ * aux_func {}
+ *
+ * template <endI, endJ, counter>
+ * std::enable_if_t<(counter > 0)>  <---- actual for-loop
+ * aux_func {
+ *   startC = endI*endJ - counter
+ *   startI = (startC)/(endJ)
+ *   startJ = (startC)%(endJ)
+ *   func(startI, startJ)
+ *   aux_func<endI, endJ, counter - 1>()
+ * }
+ *
+ * Note: Additional wrapper functions are provided for aux_func which hide the counter template
+ * parameter, since counter usually depends on endI, endJ, etc...
+ *
+ * Conventions:
+ * 1) endX: specifies the terminal value for the for-loop (ex: for(startX = 0; startX < endX; startX++))
+ *
+ * 2) rem, remM, remK template parameters are used for deciding whether to use masked operations for
+ *    handling remaining tails (when sizes are not multiples of PacketSize or EIGEN_AVX_MAX_NUM_ROW)
+ */
+namespace unrolls {
+
+template <int64_t N>
+EIGEN_ALWAYS_INLINE auto remMask(int64_t m) {
+  EIGEN_IF_CONSTEXPR(N == 16) { return 0xFFFF >> (16 - m); }
+  else EIGEN_IF_CONSTEXPR(N == 8) {
+    return 0xFF >> (8 - m);
+  }
+  else EIGEN_IF_CONSTEXPR(N == 4) {
+    return 0x0F >> (4 - m);
+  }
+  return 0;
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet, 8> &kernel);
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<vecFullFloat, 8> &kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+
+  T0 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[4]), 0x4E));
+  T0 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[0], T0);
+  T4 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[0]), 0x4E));
+  T4 = _mm512_mask_blend_ps(0xF0F0, T4, kernel.packet[4]);
+  T1 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[5]), 0x4E));
+  T1 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[1], T1);
+  T5 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[1]), 0x4E));
+  T5 = _mm512_mask_blend_ps(0xF0F0, T5, kernel.packet[5]);
+  T2 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[6]), 0x4E));
+  T2 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[2], T2);
+  T6 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[2]), 0x4E));
+  T6 = _mm512_mask_blend_ps(0xF0F0, T6, kernel.packet[6]);
+  T3 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[7]), 0x4E));
+  T3 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[3], T3);
+  T7 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[3]), 0x4E));
+  T7 = _mm512_mask_blend_ps(0xF0F0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<vecFullDouble, 8> &kernel) {
+  ptranspose(kernel);
+}
+
+/***
+ * Unrolls for transposed C stores
+ */
+template <typename Scalar>
+class trans {
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
+
+  /***********************************
+   * Auxiliary Functions for:
+   *  - storeC
+   ***********************************
+   */
+
+  /**
+   * aux_storeC
+   *
+   * 1-D unroll
+   *   for(startN = 0; startN < endN; startN++)
+   *
+   * (endN <= PacketSize) is required to handle the fp32 case, see comments in transStoreC
+   *
+   **/
+  template <int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && endN <= PacketSize)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &zmm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(startN < EIGEN_AVX_MAX_NUM_ROW) {
+      EIGEN_IF_CONSTEXPR(remM) {
+        pstoreu(
+            C_arr + LDC * startN,
+            padd(ploadu((const Scalar *)C_arr + LDC * startN, remMask(remM_)),
+                 preinterpret(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN]),
+                 remMask(remM_)),
+            remMask(remM_));
+      }
+      else {
+        pstoreu(C_arr + LDC * startN,
+                padd(ploadu((const Scalar *)C_arr + LDC * startN),
+                     preinterpret(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN])));
+      }
+    }
+    else {  // This block is only needed for the fp32 case
+      // Reinterpret as __m512 for _mm512_shuffle_f32x4
+      vecFullFloat zmm2vecFullFloat = preinterpret(
+          zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)]);
+      // Swap lower and upper half of avx register.
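+      // (For startN >= EIGEN_AVX_MAX_NUM_ROW, the fp32 data to be stored sits in the
+      // upper 256 bits of the zmm register, so it is moved into the lower half first.)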
+ zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)] = + preinterpret(_mm512_shuffle_f32x4(zmm2vecFullFloat, zmm2vecFullFloat, 0b01001110)); + + EIGEN_IF_CONSTEXPR(remM) { + pstoreu( + C_arr + LDC * startN, + padd(ploadu((const Scalar *)C_arr + LDC * startN, remMask(remM_)), + preinterpret( + zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)])), + remMask(remM_)); + } + else { + pstoreu( + C_arr + LDC * startN, + padd(ploadu((const Scalar *)C_arr + LDC * startN), + preinterpret( + zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)]))); + } + } + aux_storeC(C_arr, LDC, zmm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t 0 && endN <= PacketSize)> aux_storeC( + Scalar *C_arr, int64_t LDC, PacketBlock &zmm, int64_t remM_ = 0) { + EIGEN_UNUSED_VARIABLE(C_arr); + EIGEN_UNUSED_VARIABLE(LDC); + EIGEN_UNUSED_VARIABLE(zmm); + EIGEN_UNUSED_VARIABLE(remM_); + } + + template + static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC, + PacketBlock &zmm, + int64_t remM_ = 0) { + aux_storeC(C_arr, LDC, zmm, remM_); + } + + /** + * Transposes LxunrollN row major block of matrices stored EIGEN_AVX_MAX_NUM_ACC zmm registers to + * "unrollN"xL ymm registers to be stored col-major into C. + * + * For 8x48, the 8x48 block (row-major) is stored in zmm as follows: + * + * row0: zmm0 zmm1 zmm2 + * row1: zmm3 zmm4 zmm5 + * . + * . + * row7: zmm21 zmm22 zmm23 + * + * For 8x32, the 8x32 block (row-major) is stored in zmm as follows: + * + * row0: zmm0 zmm1 + * row1: zmm2 zmm3 + * . + * . + * row7: zmm14 zmm15 + * + * + * In general we will have {1,2,3} groups of avx registers each of size + * EIGEN_AVX_MAX_NUM_ROW. packetIndexOffset is used to select which "block" of + * avx registers are being transposed. + */ + template + static EIGEN_ALWAYS_INLINE void transpose(PacketBlock &zmm) { + // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted + // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller. + constexpr int64_t zmmStride = unrollN / PacketSize; + PacketBlock r; + r.packet[0] = zmm.packet[packetIndexOffset + zmmStride * 0]; + r.packet[1] = zmm.packet[packetIndexOffset + zmmStride * 1]; + r.packet[2] = zmm.packet[packetIndexOffset + zmmStride * 2]; + r.packet[3] = zmm.packet[packetIndexOffset + zmmStride * 3]; + r.packet[4] = zmm.packet[packetIndexOffset + zmmStride * 4]; + r.packet[5] = zmm.packet[packetIndexOffset + zmmStride * 5]; + r.packet[6] = zmm.packet[packetIndexOffset + zmmStride * 6]; + r.packet[7] = zmm.packet[packetIndexOffset + zmmStride * 7]; + trans8x8blocks(r); + zmm.packet[packetIndexOffset + zmmStride * 0] = r.packet[0]; + zmm.packet[packetIndexOffset + zmmStride * 1] = r.packet[1]; + zmm.packet[packetIndexOffset + zmmStride * 2] = r.packet[2]; + zmm.packet[packetIndexOffset + zmmStride * 3] = r.packet[3]; + zmm.packet[packetIndexOffset + zmmStride * 4] = r.packet[4]; + zmm.packet[packetIndexOffset + zmmStride * 5] = r.packet[5]; + zmm.packet[packetIndexOffset + zmmStride * 6] = r.packet[6]; + zmm.packet[packetIndexOffset + zmmStride * 7] = r.packet[7]; + } +}; + +/** + * Unrolls for copyBToRowMajor + * + * Idea: + * 1) Load a block of right-hand sides to registers (using loadB). + * 2) Convert the block from column-major to row-major (transposeLxL) + * 3) Store the blocks from register either to a temp array (toTemp == true), or back to B (toTemp == false). 
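+ *
+ * For example (an illustration, assuming EIGEN_AVX_MAX_NUM_ROW == 8): with double, one 8x8 sub-block of B
+ * is held in 8 packets and transposed in place; with float, the half-register (ymm) view also makes each
+ * LxL transpose an 8x8 block of floats, so a U1 = 16 column block takes two such transposes.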
+ * + * We use at most EIGEN_AVX_MAX_NUM_ACC avx registers to store the blocks of B. The remaining registers are + * used as temps for transposing. + * + * Blocks will be of size Lx{U1,U2,U3}. packetIndexOffset is used to index between these subblocks + * For fp32, PacketSize = 2*EIGEN_AVX_MAX_NUM_ROW, so we reinterpret packets as packets half the size (zmm -> ymm). + */ +template +class transB { + public: + using vec = typename std::conditional::value, vecFullFloat, vecFullDouble>::type; + using vecHalf = typename std::conditional::value, vecHalfFloat, vecFullDouble>::type; + static constexpr int64_t PacketSize = packet_traits::size; + + /*********************************** + * Auxillary Functions for: + * - loadB + * - storeB + * - loadBBlock + * - storeBBlock + *********************************** + */ + + /** + * aux_loadB + * + * 1-D unroll + * for(startN = 0; startN < endN; startN++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB( + Scalar *B_arr, int64_t LDB, PacketBlock &ymm, + int64_t remM_ = 0) { + constexpr int64_t counterReverse = endN - counter; + constexpr int64_t startN = counterReverse; + + EIGEN_IF_CONSTEXPR(remM) { + ymm.packet[packetIndexOffset + startN] = + ploadu((const Scalar *)&B_arr[startN * LDB], remMask(remM_)); + } + else ymm.packet[packetIndexOffset + startN] = ploadu((const Scalar *)&B_arr[startN * LDB]); + + aux_loadB(B_arr, LDB, ymm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB( + Scalar *B_arr, int64_t LDB, PacketBlock &ymm, + int64_t remM_ = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(ymm); + EIGEN_UNUSED_VARIABLE(remM_); + } + + /** + * aux_storeB + * + * 1-D unroll + * for(startN = 0; startN < endN; startN++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeB( + Scalar *B_arr, int64_t LDB, PacketBlock &ymm, int64_t rem_ = 0) { + constexpr int64_t counterReverse = endN - counter; + constexpr int64_t startN = counterReverse; + + EIGEN_IF_CONSTEXPR(remK || remM) { + pstoreu(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN], + remMask(rem_)); + } + else { + pstoreu(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN]); + } + + aux_storeB(B_arr, LDB, ymm, rem_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeB( + Scalar *B_arr, int64_t LDB, PacketBlock &ymm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(ymm); + EIGEN_UNUSED_VARIABLE(rem_); + } + + /** + * aux_loadBBlock + * + * 1-D unroll + * for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadBBlock( + Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, int64_t remM_ = 0) { + constexpr int64_t counterReverse = endN - counter; + constexpr int64_t startN = counterReverse; + transB::template loadB(&B_temp[startN], LDB_, ymm); + aux_loadBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadBBlock( + Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, int64_t remM_ = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(B_temp); + EIGEN_UNUSED_VARIABLE(LDB_); + EIGEN_UNUSED_VARIABLE(ymm); + EIGEN_UNUSED_VARIABLE(remM_); + } + + /** + * aux_storeBBlock + * + * 
1-D unroll + * for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeBBlock( + Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, int64_t remM_ = 0) { + constexpr int64_t counterReverse = endN - counter; + constexpr int64_t startN = counterReverse; + + EIGEN_IF_CONSTEXPR(toTemp) { + transB::template storeB(&B_temp[startN], LDB_, ymm, remK_); + } + else { + transB::template storeB(&B_arr[0 + startN * LDB], LDB, + ymm, remM_); + } + aux_storeBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeBBlock( + Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, int64_t remM_ = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(B_temp); + EIGEN_UNUSED_VARIABLE(LDB_); + EIGEN_UNUSED_VARIABLE(ymm); + EIGEN_UNUSED_VARIABLE(remM_); + } + + /******************************************************** + * Wrappers for aux_XXXX to hide counter parameter + ********************************************************/ + + template + static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_arr, int64_t LDB, + PacketBlock &ymm, + int64_t remM_ = 0) { + aux_loadB(B_arr, LDB, ymm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE void storeB(Scalar *B_arr, int64_t LDB, + PacketBlock &ymm, + int64_t rem_ = 0) { + aux_storeB(B_arr, LDB, ymm, rem_); + } + + template + static EIGEN_ALWAYS_INLINE void loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, + int64_t remM_ = 0) { + EIGEN_IF_CONSTEXPR(toTemp) { transB::template loadB(&B_arr[0], LDB, ymm, remM_); } + else { + aux_loadBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + } + + template + static EIGEN_ALWAYS_INLINE void storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, + int64_t remM_ = 0) { + aux_storeBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + + template + static EIGEN_ALWAYS_INLINE void transposeLxL(PacketBlock &ymm) { + // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted + // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller. 
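+    // The copy/ptranspose/copy sequence below is an in-register 8x8 transpose of one
+    // LxL tile of vecHalf packets; the result lands back in the same packet slots.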
+ PacketBlock r; + r.packet[0] = ymm.packet[packetIndexOffset + 0]; + r.packet[1] = ymm.packet[packetIndexOffset + 1]; + r.packet[2] = ymm.packet[packetIndexOffset + 2]; + r.packet[3] = ymm.packet[packetIndexOffset + 3]; + r.packet[4] = ymm.packet[packetIndexOffset + 4]; + r.packet[5] = ymm.packet[packetIndexOffset + 5]; + r.packet[6] = ymm.packet[packetIndexOffset + 6]; + r.packet[7] = ymm.packet[packetIndexOffset + 7]; + ptranspose(r); + ymm.packet[packetIndexOffset + 0] = r.packet[0]; + ymm.packet[packetIndexOffset + 1] = r.packet[1]; + ymm.packet[packetIndexOffset + 2] = r.packet[2]; + ymm.packet[packetIndexOffset + 3] = r.packet[3]; + ymm.packet[packetIndexOffset + 4] = r.packet[4]; + ymm.packet[packetIndexOffset + 5] = r.packet[5]; + ymm.packet[packetIndexOffset + 6] = r.packet[6]; + ymm.packet[packetIndexOffset + 7] = r.packet[7]; + } + + template + static EIGEN_ALWAYS_INLINE void transB_kernel(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_, + PacketBlock &ymm, + int64_t remM_ = 0) { + constexpr int64_t U3 = PacketSize * 3; + constexpr int64_t U2 = PacketSize * 2; + constexpr int64_t U1 = PacketSize * 1; + /** + * Unrolls needed for each case: + * - AVX512 fp32 48 32 16 8 4 2 1 + * - AVX512 fp64 24 16 8 4 2 1 + * + * For fp32 L and U1 are 1:2 so for U3/U2 cases the loads/stores need to be split up. + */ + EIGEN_IF_CONSTEXPR(unrollN == U3) { + // load LxU3 B col major, transpose LxU3 row major + constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U3); + transB::template loadBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template storeBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + + EIGEN_IF_CONSTEXPR(maxUBlock < U3) { + transB::template loadBBlock(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_, + ymm, remM_); + transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template storeBBlock(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_, + ymm, remM_); + } + } + else EIGEN_IF_CONSTEXPR(unrollN == U2) { + // load LxU2 B col major, transpose LxU2 row major + constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U2); + transB::template loadBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + EIGEN_IF_CONSTEXPR(maxUBlock < U2) transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm); + transB::template storeBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + + EIGEN_IF_CONSTEXPR(maxUBlock < U2) { + transB::template loadBBlock(&B_arr[maxUBlock * LDB], LDB, + &B_temp[maxUBlock], LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + transB::template storeBBlock(&B_arr[maxUBlock * LDB], LDB, + &B_temp[maxUBlock], LDB_, ymm, remM_); + } + } + else EIGEN_IF_CONSTEXPR(unrollN == U1) { + // load LxU1 B col major, transpose LxU1 row major + transB::template loadBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + EIGEN_IF_CONSTEXPR(EIGEN_AVX_MAX_NUM_ROW < U1) { transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); } + transB::template storeBBlock(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + else EIGEN_IF_CONSTEXPR(unrollN == 8 
&& U1 > 8) { + // load Lx4 B col major, transpose Lx4 row major + transB::template loadBBlock<8, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + transB::template storeBBlock<8, toTemp, remM, 8>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + else EIGEN_IF_CONSTEXPR(unrollN == 4 && U1 > 4) { + // load Lx4 B col major, transpose Lx4 row major + transB::template loadBBlock<4, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + transB::template storeBBlock<4, toTemp, remM, 4>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + else EIGEN_IF_CONSTEXPR(unrollN == 2) { + // load Lx2 B col major, transpose Lx2 row major + transB::template loadBBlock<2, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + transB::template storeBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + else EIGEN_IF_CONSTEXPR(unrollN == 1) { + // load Lx1 B col major, transpose Lx1 row major + transB::template loadBBlock<1, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + transB::template transposeLxL<0>(ymm); + transB::template storeBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_); + } + } +}; + +/** + * Unrolls for triSolveKernel + * + * Idea: + * 1) Load a block of right-hand sides to registers in RHSInPacket (using loadRHS). + * 2) Do triangular solve with RHSInPacket and a small block of A (triangular matrix) + * stored in AInPacket (using triSolveMicroKernel). + * 3) Store final results (in avx registers) back into memory (using storeRHS). + * + * RHSInPacket uses at most EIGEN_AVX_MAX_NUM_ACC avx registers and AInPacket uses at most + * EIGEN_AVX_MAX_NUM_ROW registers. + */ +template +class trsm { + public: + using vec = typename std::conditional::value, vecFullFloat, vecFullDouble>::type; + static constexpr int64_t PacketSize = packet_traits::size; + + /*********************************** + * Auxillary Functions for: + * - loadRHS + * - storeRHS + * - divRHSByDiag + * - updateRHS + * - triSolveMicroKernel + ************************************/ + /** + * aux_loadRHS + * + * 2-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startK = 0; startK < endK; startK++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadRHS( + Scalar *B_arr, int64_t LDB, PacketBlock &RHSInPacket, int64_t rem = 0) { + constexpr int64_t counterReverse = endM * endK - counter; + constexpr int64_t startM = counterReverse / (endK); + constexpr int64_t startK = counterReverse % endK; + + constexpr int64_t packetIndex = startM * endK + startK; + constexpr int64_t startM_ = isFWDSolve ? 
startM : -startM; + const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB; + EIGEN_IF_CONSTEXPR(krem) { + RHSInPacket.packet[packetIndex] = ploadu(&B_arr[rhsIndex], remMask(rem)); + } + else { + RHSInPacket.packet[packetIndex] = ploadu(&B_arr[rhsIndex]); + } + aux_loadRHS(B_arr, LDB, RHSInPacket, rem); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadRHS( + Scalar *B_arr, int64_t LDB, PacketBlock &RHSInPacket, int64_t rem = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(RHSInPacket); + EIGEN_UNUSED_VARIABLE(rem); + } + + /** + * aux_storeRHS + * + * 2-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startK = 0; startK < endK; startK++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeRHS( + Scalar *B_arr, int64_t LDB, PacketBlock &RHSInPacket, int64_t rem = 0) { + constexpr int64_t counterReverse = endM * endK - counter; + constexpr int64_t startM = counterReverse / (endK); + constexpr int64_t startK = counterReverse % endK; + + constexpr int64_t packetIndex = startM * endK + startK; + constexpr int64_t startM_ = isFWDSolve ? startM : -startM; + const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB; + EIGEN_IF_CONSTEXPR(krem) { + pstoreu(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex], remMask(rem)); + } + else { + pstoreu(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex]); + } + aux_storeRHS(B_arr, LDB, RHSInPacket, rem); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeRHS( + Scalar *B_arr, int64_t LDB, PacketBlock &RHSInPacket, int64_t rem = 0) { + EIGEN_UNUSED_VARIABLE(B_arr); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(RHSInPacket); + EIGEN_UNUSED_VARIABLE(rem); + } + + /** + * aux_divRHSByDiag + * + * currM may be -1, (currM >=0) in enable_if checks for this + * + * 1-D unroll + * for(startK = 0; startK < endK; startK++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && currM >= 0)> aux_divRHSByDiag( + PacketBlock &RHSInPacket, PacketBlock &AInPacket) { + constexpr int64_t counterReverse = endK - counter; + constexpr int64_t startK = counterReverse; + + constexpr int64_t packetIndex = currM * endK + startK; + RHSInPacket.packet[packetIndex] = pmul(AInPacket.packet[currM], RHSInPacket.packet[packetIndex]); + aux_divRHSByDiag(RHSInPacket, AInPacket); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t 0 && currM >= 0)> aux_divRHSByDiag( + PacketBlock &RHSInPacket, PacketBlock &AInPacket) { + EIGEN_UNUSED_VARIABLE(RHSInPacket); + EIGEN_UNUSED_VARIABLE(AInPacket); + } + + /** + * aux_updateRHS + * + * 2-D unroll + * for(startM = initM; startM < endM; startM++) + * for(startK = 0; startK < endK; startK++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateRHS( + Scalar *A_arr, int64_t LDA, PacketBlock &RHSInPacket, + PacketBlock &AInPacket) { + constexpr int64_t counterReverse = (endM - initM) * endK - counter; + constexpr int64_t startM = initM + counterReverse / (endK); + constexpr int64_t startK = counterReverse % endK; + + // For each row of A, first update all corresponding RHS + constexpr int64_t packetIndex = startM * endK + startK; + EIGEN_IF_CONSTEXPR(currentM > 0) { + RHSInPacket.packet[packetIndex] = + pnmadd(AInPacket.packet[startM], RHSInPacket.packet[(currentM - 1) * endK + startK], + RHSInPacket.packet[packetIndex]); + } + + EIGEN_IF_CONSTEXPR(startK == endK - 1) { + // Once all RHS for previous row 
of A are updated, we broadcast the next element in the column A_{i, currentM}.
+      EIGEN_IF_CONSTEXPR(startM == currentM && !isUnitDiag) {
+        // If the diagonal is not unit, we broadcast the reciprocal of the diagonal into AInPacket.packet[currentM].
+        // This will be used in divRHSByDiag
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[currentM] = pset1(Scalar(1) / A_arr[idA(currentM, currentM, LDA)]);
+        else AInPacket.packet[currentM] = pset1(Scalar(1) / A_arr[idA(-currentM, -currentM, LDA)]);
+      }
+      else {
+        // Broadcast the next off-diagonal element of A
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[startM] = pset1(A_arr[idA(startM, currentM, LDA)]);
+        else AInPacket.packet[startM] = pset1(A_arr[idA(-startM, -currentM, LDA)]);
+      }
+    }
+
+    aux_updateRHS(
+        A_arr, LDA, RHSInPacket, AInPacket);
+  }
+
+  template
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateRHS(
+      Scalar *A_arr, int64_t LDA, PacketBlock &RHSInPacket,
+      PacketBlock &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
+
+  /**
+   * aux_triSolveMicroKernel
+   *
+   * 1-D unroll
+   *   for(startM = 0; startM < endM; startM++)
+   **/
+  template
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock &RHSInPacket,
+      PacketBlock &AInPacket) {
+    constexpr int64_t counterReverse = endM - counter;
+    constexpr int64_t startM = counterReverse;
+
+    constexpr int64_t currentM = startM;
+    // Divide the right-hand sides in row startM by the diagonal value of A
+    // broadcast to AInPacket.packet[startM-1] in the previous iteration.
+    //
+    // Without "if constexpr" the compiler instantiates the case <-1, numK>;
+    // this is handled with enable_if to prevent out-of-bounds warnings
+    // from the compiler.
+    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM > 0)
+    trsm::template divRHSByDiag(RHSInPacket, AInPacket);
+
+    // After division, the rhs corresponding to subsequent rows of A can be partially updated.
+    // We also broadcast the reciprocal of the next diagonal to AInPacket.packet[currentM] (if needed)
+    // to be used in the next iteration.
+    trsm::template updateRHS(A_arr, LDA, RHSInPacket,
+                             AInPacket);
+
+    // Handle division for the RHS corresponding to the final row of A.
+    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM == endM - 1)
+    trsm::template divRHSByDiag(RHSInPacket, AInPacket);
+
+    aux_triSolveMicroKernel(A_arr, LDA, RHSInPacket,
+                            AInPacket);
+  }
+
+  template
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock &RHSInPacket,
+      PacketBlock &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
+
+  /********************************************************
+   * Wrappers for aux_XXXX to hide counter parameter
+   ********************************************************/
+
+  /**
+   * Load an endMxendK block of B to RHSInPacket
+   * Masked loads are used for cases where endK is not a multiple of PacketSize
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void loadRHS(Scalar *B_arr, int64_t LDB,
+                                          PacketBlock &RHSInPacket, int64_t rem = 0) {
+    aux_loadRHS(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  /**
+   * Store an endMxendK block of B from RHSInPacket
+   * Masked stores are used for cases where endK is not a multiple of PacketSize
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void storeRHS(Scalar *B_arr, int64_t LDB,
+                                           PacketBlock &RHSInPacket, int64_t rem = 0) {
+    aux_storeRHS(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  /**
+   * Only used if the triangular matrix has non-unit diagonal values
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void divRHSByDiag(PacketBlock &RHSInPacket,
+                                               PacketBlock &AInPacket) {
+    aux_divRHSByDiag(RHSInPacket, AInPacket);
+  }
+
+  /**
+   * Update the right-hand sides (stored in avx registers), traversing along the column A_{i,currentM},
+   * where currentM <= i <= endM, and broadcasting each value to AInPacket.
+   **/
+  template
+  static EIGEN_ALWAYS_INLINE void updateRHS(Scalar *A_arr, int64_t LDA,
+                                            PacketBlock &RHSInPacket,
+                                            PacketBlock &AInPacket) {
+    aux_updateRHS(
+        A_arr, LDA, RHSInPacket, AInPacket);
+  }
+
+  /**
+   * endM: dimension of A. 1 <= endM <= EIGEN_AVX_MAX_NUM_ROW
+   * numK: number of avx registers to use for each row of B (ex fp32: 48 rhs => 3 avx reg used). 1 <= numK <= 3.
+   * isFWDSolve: true => forward substitution, false => backward substitution
+   * isUnitDiag: true => triangular matrix has a unit diagonal.
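+   *
+   * For example (illustration): in fp32 with 48 right-hand sides, numK = 3 packets of 16 floats hold one
+   * row of B, so an 8x48 block occupies 8*3 = 24 registers, i.e. the EIGEN_AVX_MAX_NUM_ACC budget noted
+   * in the class comment above.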
+ */ + template + static EIGEN_ALWAYS_INLINE void triSolveMicroKernel(Scalar *A_arr, int64_t LDA, + PacketBlock &RHSInPacket, + PacketBlock &AInPacket) { + static_assert(numK >= 1 && numK <= 3, "numK out of range"); + aux_triSolveMicroKernel(A_arr, LDA, RHSInPacket, AInPacket); + } +}; + +/** + * Unrolls for gemm kernel + * + * isAdd: true => C += A*B, false => C -= A*B + */ +template +class gemm { + public: + using vec = typename std::conditional::value, vecFullFloat, vecFullDouble>::type; + static constexpr int64_t PacketSize = packet_traits::size; + + /*********************************** + * Auxillary Functions for: + * - setzero + * - updateC + * - storeC + * - startLoadB + * - triSolveMicroKernel + ************************************/ + + /** + * aux_setzero + * + * 2-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startN = 0; startN < endN; startN++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_setzero( + PacketBlock &zmm) { + constexpr int64_t counterReverse = endM * endN - counter; + constexpr int64_t startM = counterReverse / (endN); + constexpr int64_t startN = counterReverse % endN; + + zmm.packet[startN * endM + startM] = pzero(zmm.packet[startN * endM + startM]); + aux_setzero(zmm); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_setzero( + PacketBlock &zmm) { + EIGEN_UNUSED_VARIABLE(zmm); + } + + /** + * aux_updateC + * + * 2-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startN = 0; startN < endN; startN++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateC( + Scalar *C_arr, int64_t LDC, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + constexpr int64_t counterReverse = endM * endN - counter; + constexpr int64_t startM = counterReverse / (endN); + constexpr int64_t startN = counterReverse % endN; + + EIGEN_IF_CONSTEXPR(rem) + zmm.packet[startN * endM + startM] = + padd(ploadu(&C_arr[(startN)*LDC + startM * PacketSize], remMask(rem_)), + zmm.packet[startN * endM + startM], remMask(rem_)); + else zmm.packet[startN * endM + startM] = + padd(ploadu(&C_arr[(startN)*LDC + startM * PacketSize]), zmm.packet[startN * endM + startM]); + aux_updateC(C_arr, LDC, zmm, rem_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateC( + Scalar *C_arr, int64_t LDC, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(C_arr); + EIGEN_UNUSED_VARIABLE(LDC); + EIGEN_UNUSED_VARIABLE(zmm); + EIGEN_UNUSED_VARIABLE(rem_); + } + + /** + * aux_storeC + * + * 2-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startN = 0; startN < endN; startN++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeC( + Scalar *C_arr, int64_t LDC, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + constexpr int64_t counterReverse = endM * endN - counter; + constexpr int64_t startM = counterReverse / (endN); + constexpr int64_t startN = counterReverse % endN; + + EIGEN_IF_CONSTEXPR(rem) + pstoreu(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM], + remMask(rem_)); + else pstoreu(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM]); + aux_storeC(C_arr, LDC, zmm, rem_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeC( + Scalar *C_arr, int64_t LDC, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(C_arr); + EIGEN_UNUSED_VARIABLE(LDC); + 
EIGEN_UNUSED_VARIABLE(zmm); + EIGEN_UNUSED_VARIABLE(rem_); + } + + /** + * aux_startLoadB + * + * 1-D unroll + * for(startL = 0; startL < endL; startL++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startLoadB( + Scalar *B_t, int64_t LDB, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + constexpr int64_t counterReverse = endL - counter; + constexpr int64_t startL = counterReverse; + + EIGEN_IF_CONSTEXPR(rem) + zmm.packet[unrollM * unrollN + startL] = + ploadu(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize], remMask(rem_)); + else zmm.packet[unrollM * unrollN + startL] = + ploadu(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize]); + + aux_startLoadB(B_t, LDB, zmm, rem_); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startLoadB( + Scalar *B_t, int64_t LDB, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(B_t); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(zmm); + EIGEN_UNUSED_VARIABLE(rem_); + } + + /** + * aux_startBCastA + * + * 1-D unroll + * for(startB = 0; startB < endB; startB++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startBCastA( + Scalar *A_t, int64_t LDA, PacketBlock &zmm) { + constexpr int64_t counterReverse = endB - counter; + constexpr int64_t startB = counterReverse; + + zmm.packet[unrollM * unrollN + numLoad + startB] = pload1(&A_t[idA(startB, 0, LDA)]); + + aux_startBCastA(A_t, LDA, zmm); + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startBCastA( + Scalar *A_t, int64_t LDA, PacketBlock &zmm) { + EIGEN_UNUSED_VARIABLE(A_t); + EIGEN_UNUSED_VARIABLE(LDA); + EIGEN_UNUSED_VARIABLE(zmm); + } + + /** + * aux_loadB + * currK: current K + * + * 1-D unroll + * for(startM = 0; startM < endM; startM++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB( + Scalar *B_t, int64_t LDB, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + if ((numLoad / endM + currK < unrollK)) { + constexpr int64_t counterReverse = endM - counter; + constexpr int64_t startM = counterReverse; + + EIGEN_IF_CONSTEXPR(rem) { + zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] = + ploadu(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize], remMask(rem_)); + } + else { + zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] = + ploadu(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize]); + } + + aux_loadB(B_t, LDB, zmm, rem_); + } + } + + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB( + Scalar *B_t, int64_t LDB, PacketBlock &zmm, int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(B_t); + EIGEN_UNUSED_VARIABLE(LDB); + EIGEN_UNUSED_VARIABLE(zmm); + EIGEN_UNUSED_VARIABLE(rem_); + } + + /** + * aux_microKernel + * + * 3-D unroll + * for(startM = 0; startM < endM; startM++) + * for(startN = 0; startN < endN; startN++) + * for(startK = 0; startK < endK; startK++) + **/ + template + static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_microKernel( + Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock &zmm, + int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + constexpr int64_t counterReverse = endM * endN * endK - counter; + constexpr int startK = counterReverse / (endM * endN); + constexpr int startN = (counterReverse / (endM)) % endN; + constexpr int startM = counterReverse % endM; + + EIGEN_IF_CONSTEXPR(startK == 0 && startM == 0 && startN == 0) { + 
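      // First flattened iteration only (startK == startM == startN == 0): preload the
+      // initial numLoad packets of B and broadcast the first numBCast elements of A.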
+      gemm::template startLoadB(B_t, LDB, zmm, rem_);
+      gemm::template startBCastA(A_t, LDA, zmm);
+    }
+
+    {
+      // Interleave FMA and Bcast
+      EIGEN_IF_CONSTEXPR(isAdd) {
+        zmm.packet[startN * endM + startM] =
+            pmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                  zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
+      }
+      else {
+        zmm.packet[startN * endM + startM] =
+            pnmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                   zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
+      }
+      // Bcast
+      EIGEN_IF_CONSTEXPR(startM == endM - 1 && (numBCast + startN + startK * endN < endK * endN)) {
+        zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast] = pload1(&A_t[idA(
+            (numBCast + startN + startK * endN) % endN, (numBCast + startN + startK * endN) / endN, LDA)]);
+      }
+    }
+
+    // We have updated all accumulators, time to load the next set of B's
+    EIGEN_IF_CONSTEXPR((startN == endN - 1) && (startM == endM - 1)) {
+      gemm::template loadB(B_t, LDB, zmm, rem_);
+    }
+    aux_microKernel(B_t, A_t, LDB, LDA, zmm, rem_);
+  }
+
+  template
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_microKernel(
+      Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock &zmm,
+      int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_t);
+    EIGEN_UNUSED_VARIABLE(A_t);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /********************************************************
+   * Wrappers for aux_XXXX to hide counter parameter
+   ********************************************************/
+
+  template
+  static EIGEN_ALWAYS_INLINE void setzero(PacketBlock &zmm) {
+    aux_setzero(zmm);
+  }
+
+  /**
+   * Ideally the compiler folds these into vaddp{s,d} with an embedded memory load.
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void updateC(Scalar *C_arr, int64_t LDC,
+                                          PacketBlock &zmm,
+                                          int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_updateC(C_arr, LDC, zmm, rem_);
+  }
+
+  template
+  static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC,
+                                         PacketBlock &zmm,
+                                         int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_storeC(C_arr, LDC, zmm, rem_);
+  }
+
+  /**
+   * Use numLoad registers for loading B at the start of microKernel
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void startLoadB(Scalar *B_t, int64_t LDB,
+                                             PacketBlock &zmm,
+                                             int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_startLoadB(B_t, LDB, zmm, rem_);
+  }
+
+  /**
+   * Use numBCast registers for broadcasting A at the start of microKernel
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void startBCastA(Scalar *A_t, int64_t LDA,
+                                              PacketBlock &zmm) {
+    aux_startBCastA(A_t, LDA, zmm);
+  }
+
+  /**
+   * Loads the next set of B into vector registers between each K unroll.
+   */
+  template
+  static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_t, int64_t LDB,
+                                        PacketBlock &zmm,
+                                        int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_loadB(B_t, LDB, zmm, rem_);
+  }
+
+  /**
+   * Generates a microkernel for gemm (row-major) with unrolls {1,2,4,8}x{U1,U2,U3} to compute C -= A*B.
+   * The A matrix can be row- or col-major. The B matrix is assumed row-major.
+   *
+   * isARowMajor: is A row-major
+   * endM: Number of registers per row
+   * endN: Number of rows
+   * endK: Loop unroll for K.
+   * numLoad: Number of registers for loading B.
+   * numBCast: Number of registers for broadcasting A.
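+   *
+   * As a register-budget illustration (fp32, 16-float packets): the 8x48 case below uses endM = 3 and
+   * endN = 8, so 24 accumulators + 6 B-load registers + 2 A-broadcast registers = 32, exactly the
+   * number of zmm registers available.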
+ * + * Ex: microkernel: 8x48 unroll (24 accumulators), k unrolled 4 times, + * 6 register for loading B, 2 for broadcasting A. + * + * Note: Ideally the microkernel should not have any register spilling. + * The avx instruction counts should be: + * - endK*endN vbroadcasts{s,d} + * - endK*endM vmovup{s,d} + * - endK*endN*endM FMAs + * + * From testing, there are no register spills with clang. There are register spills with GNU, which + * causes a performance hit. + */ + template + static EIGEN_ALWAYS_INLINE void microKernel(Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, + PacketBlock &zmm, + int64_t rem_ = 0) { + EIGEN_UNUSED_VARIABLE(rem_); + aux_microKernel(B_t, A_t, LDB, LDA, zmm, + rem_); + } +}; +} // namespace unrolls + +#endif // EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H diff --git a/libs/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h index 3304127..62a7429 100644 --- a/libs/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TYPE_CASTING_AVX512_H #define EIGEN_TYPE_CASTING_AVX512_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -30,6 +32,56 @@ template<> EIGEN_STRONG_INLINE Packet16f preinterpret(cons return _mm512_castsi512_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet16f& a) { + return _mm512_castps_pd(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet8d& a) { + return _mm512_castpd_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet16f& a) { + return _mm512_castps512_ps256(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet16f& a) { + return _mm512_castps512_ps128(a); +} + +template<> EIGEN_STRONG_INLINE Packet4d preinterpret(const Packet8d& a) { + return _mm512_castpd512_pd256(a); +} + +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet8d& a) { + return _mm512_castpd512_pd128(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet8f& a) { + return _mm512_castps256_ps512(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet4f& a) { + return _mm512_castps128_ps512(a); +} + +template<> EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet4d& a) { + return _mm512_castpd256_pd512(a); +} + +template<> EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet2d& a) { + return _mm512_castpd128_pd512(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16f& a) { + return a; +} + +template<> EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet8d& a) { + return a; +} + +#ifndef EIGEN_VECTORIZE_AVX512FP16 + template <> struct type_casting_traits { enum { @@ -56,6 +108,8 @@ template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packe return float2half(a); } +#endif + template <> struct type_casting_traits { enum { @@ -82,6 +136,77 @@ template<> EIGEN_STRONG_INLINE Packet16bf pcast(const Pac return F32ToBf16(a); } +#ifdef EIGEN_VECTORIZE_AVX512FP16 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16f pcast(const Packet32h& a) { + // Discard second-half of input. 
+ Packet16h low = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0)); + return _mm512_cvtxph_ps(_mm256_castsi256_ph(low)); +} + + +template <> +EIGEN_STRONG_INLINE Packet32h pcast(const Packet16f& a, const Packet16f& b) { + __m512d result = _mm512_undefined_pd(); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0); + result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1); + return _mm512_castpd_ph(result); +} + +template <> +EIGEN_STRONG_INLINE Packet8f pcast(const Packet16h& a) { + // Discard second-half of input. + Packet8h low = _mm_castps_si128(_mm256_extractf32x4_ps(_mm256_castsi256_ps(a), 0)); + return _mm256_cvtxph_ps(_mm_castsi128_ph(low)); +} + + +template <> +EIGEN_STRONG_INLINE Packet16h pcast(const Packet8f& a, const Packet8f& b) { + __m256d result = _mm256_undefined_pd(); + result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0); + result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1); + return _mm256_castpd_si256(result); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8h& a) { + Packet8f full = _mm256_cvtxph_ps(_mm_castsi128_ph(a)); + // Discard second-half of input. + return _mm256_extractf32x4_ps(full, 0); +} + + +template <> +EIGEN_STRONG_INLINE Packet8h pcast(const Packet4f& a, const Packet4f& b) { + __m256 result = _mm256_undefined_ps(); + result = _mm256_insertf128_ps(result, a, 0); + result = _mm256_insertf128_ps(result, b, 1); + return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +} + + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/Complex.h index f424f11..46812f9 100644 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/Complex.h @@ -11,6 +11,8 @@ #ifndef EIGEN_COMPLEX32_ALTIVEC_H #define EIGEN_COMPLEX32_ALTIVEC_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -100,6 +102,7 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, + HasSqrt = 1, #ifdef __VSX__ HasBlend = 1, #endif @@ -112,53 +115,99 @@ template<> struct unpacket_traits { typedef std::complex type; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; +#ifdef __VSX__ + // Load a single std::complex from memory and duplicate + // + // Using pload would read past the end of the reference in this case + // Using vec_xl_len + vec_splat, generates poor assembly + __asm__ ("lxvdsx %x0,%y1" : "=wa" (res.v) : "Z" (from)); +#else if((std::ptrdiff_t(&from) % 16) == 0) res.v = pload((const float *)&from); else res.v = ploadu((const float *)&from); res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); +#endif return res; } template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { return Packet2cf(pload((const float *) from)); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { return Packet2cf(ploadu((const float*) from)); } +template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial(const std::complex* from, const Index n, const Index offset) +{ + return Packet2cf(pload_partial((const float *) from, n * 2, offset * 2)); +} +template<> EIGEN_ALWAYS_INLINE Packet2cf 
ploadu_partial(const std::complex* from, const Index n) +{ + return Packet2cf(ploadu_partial((const float*) from, n * 2)); +} template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } +template<> EIGEN_ALWAYS_INLINE void pstore_partial >(std::complex * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); } +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial >(std::complex * to, const Packet2cf& from, const Index n) { pstoreu_partial((float*)to, from.v, n * 2); } -EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) { Packet4f res0, res1; #ifdef __VSX__ - __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); - __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); + // Load two std::complex from memory and combine + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #else __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #endif #else - *reinterpret_cast *>(&res0) = *from0; - *reinterpret_cast *>(&res1) = *from1; + *reinterpret_cast *>(&res0) = from0; + *reinterpret_cast *>(&res1) = from1; res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); #endif return Packet2cf(res0); } -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore(const std::complex* from) { - EIGEN_ALIGN16 std::complex af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + Packet2cf res; + res.v = pload_ignore(reinterpret_cast(from)); + return res; } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2) { - EIGEN_ALIGN16 std::complex af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + eigen_assert(n <= unpacket_traits::size && "number of elements will gather past end of packet"); + EIGEN_ALIGN16 Scalar af[2]; + for (Index i = 0; i < n; i++) { + af[i] = from[i*stride]; + } + return pload_ignore(af); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + return pgather_complex_size2, Packet2cf>(from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial, Packet2cf>(const std::complex* from, Index stride, const Index n) +{ + return pgather_complex_size2, Packet2cf>(from, stride, n); +} +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2) +{ + eigen_assert(n <= unpacket_traits::size && "number of elements will scatter past end of packet"); + EIGEN_ALIGN16 Scalar af[2]; + pstore((Scalar *) af, from); + for (Index i = 0; i < n; i++) { + to[i*stride] = af[i]; + } +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + pscatter_complex_size2, Packet2cf>(to, from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride, const Index n) +{ + pscatter_complex_size2, Packet2cf>(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } @@ -184,7 +233,7 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pack template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { Packet4f rev_a; - rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + rev_a = vec_sld(a.v, a.v, 8); return Packet2cf(rev_a); } @@ -210,10 +259,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res = pmul(a, pconj(b)); - Packet4f s = pmul(b.v, b.v); - return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) @@ -223,8 +269,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#ifdef __VSX__ + Packet4f tmp = reinterpret_cast(vec_mergeh(reinterpret_cast(kernel.packet[0].v), reinterpret_cast(kernel.packet[1].v))); + kernel.packet[1].v = reinterpret_cast(vec_mergel(reinterpret_cast(kernel.packet[0].v), reinterpret_cast(kernel.packet[1].v))); +#else Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); +#endif kernel.packet[0].v = tmp; } @@ -320,6 +371,7 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, + HasSqrt = 1, HasSetLinear = 0 }; }; @@ -328,17 +380,35 @@ template<> struct unpacket_traits { typedef std::complex type template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial(const std::complex* from, const Index n, const Index offset) +{ + return Packet1cd(pload_partial((const double*)from, n * 2, offset * 2)); +} +template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial(const std::complex* from, const Index n) +{ + return Packet1cd(ploadu_partial((const double*)from, n * 2)); +} template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } +template<> EIGEN_ALWAYS_INLINE void pstore_partial >(std::complex * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); } +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial >(std::complex * to, const Packet1cd& from, const Index n) { pstoreu_partial((double*)to, from.v, n * 2); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* 
from, Index) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather, Packet1cd>(const std::complex* from, Index) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial, Packet1cd>(const std::complex* from, Index, const Index) +{ + return pload(from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) +{ + pstore >(to, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial, Packet1cd>(std::complex* to, const Packet1cd& from, Index, const Index) { pstore >(to, from); } @@ -359,7 +429,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - EIGEN_ALIGN16 std::complex res[2]; + EIGEN_ALIGN16 std::complex res[1]; pstore >(res, a); return res[0]; @@ -375,10 +445,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for AltiVec - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = pmul(b.v, b.v); - return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) @@ -388,8 +455,8 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + Packet2d tmp = vec_mergeh(kernel.packet[0].v, kernel.packet[1].v); + kernel.packet[1].v = vec_mergel(kernel.packet[0].v, kernel.packet[1].v); kernel.packet[0].v = tmp; } diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a329..6f48d98 100644 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -12,73 +12,117 @@ #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog(const Packet4f& _x) { return plog_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp(const Packet4f& _x) { return pexp_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin(const Packet4f& _x) { return psin_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos(const Packet4f& _x) { return pcos_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f pacos(const Packet4f& _x) +{ + return pacos_float(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f pasin(const Packet4f& _x) +{ + return pasin_float(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f patan(const 
Packet4f& _x) +{ + return patan_float(_x); +} + +#ifdef __VSX__ #ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt(const Packet4f& x) { return vec_rsqrt(x); } -#endif -#ifdef __VSX__ -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt(const Packet2d& x) { return vec_rsqrt(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d patan(const Packet2d& _x) +{ + return patan_double(_x); +} #endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt(const Packet2d& x) { return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& _x) { return pexp_double(_x); } + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(psqrt, a); +} + +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} +#endif +#else +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f psqrt(const Packet4f& x) +{ + Packet4f a; + for (Index i = 0; i < packet_traits::size; i++) { + a[i] = numext::sqrt(x[i]); + } + return a; +} #endif // Hyperbolic Tangent function. template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& x) { return internal::generic_fast_tanh_float(x); } diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 3f79b97..2429c81 100644 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -17,24 +17,35 @@ #include "MatrixProductCommon.h" -// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX -#if EIGEN_COMP_LLVM -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) -#ifdef __MMA__ -#define EIGEN_ALTIVEC_MMA_ONLY -#else -#define EIGEN_ALTIVEC_DISABLE_MMA -#endif -#endif +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#define EIGEN_ALTIVEC_DISABLE_MMA 0 #endif -#ifdef __has_builtin +// Check for MMA builtin support. +#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin) #if __has_builtin(__builtin_mma_assemble_acc) - #define ALTIVEC_MMA_SUPPORT + #define EIGEN_ALTIVEC_MMA_SUPPORT #endif #endif -#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) +// Check if and how we should actually use MMA if supported. +#if defined(EIGEN_ALTIVEC_MMA_SUPPORT) + +#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) +#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0 +#endif + +// Check if we want to enable dynamic dispatch. Not supported by LLVM. +#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM +#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 +// Otherwise, use MMA by default if available. 
+#elif defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 +#endif + +#endif // EIGEN_ALTIVEC_MMA_SUPPORT + +#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) #include "MatrixProductMMA.h" #endif @@ -43,6 +54,8 @@ * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could be). * * - Check the possibility of transposing as GETREAL and GETIMAG when needed. * **************************************************************************************************/ +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -78,6 +91,20 @@ struct quad_traits }; }; +template<> +struct quad_traits +{ + typedef Packet8bf vectortype; + typedef PacketBlock type; + typedef vectortype rhstype; + enum + { + vectorsize = packet_traits::size, + size = 8, + rows = 4 + }; +}; + // MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out // to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. These constants are then // responsible for extracting from, and converting between, Eigen's layout and the MatrixProduct one. @@ -91,12 +118,6 @@ const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; - -//[a,ai],[b,bi] = [ai,bi] -const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; /********************************************* * Single precision real and complex packing * @@ -116,7 +137,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, * reason why packing for complex is broken down into several different parts, also the reason why we end up having a * float32/64 and complex float32/64 version.
**/ -template +template EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; @@ -135,7 +156,7 @@ EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_b return v; } -template +template EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; @@ -153,7 +174,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc { for(Index k = 0; k < vectorSize; k++) { - std::complex v = getAdjointVal(i, j + k, rhs); + std::complex v = getAdjointVal(i, j + k, rhs); blockBf[rir + k] = v.real(); blockBf[rii + k] = v.imag(); @@ -164,35 +185,34 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc rir += vectorDelta; } - if (j < cols) + + for(; j < cols; j++) { - rii = rir + ((cols - j) * rows); + rii = rir + rows; for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - std::complex v = getAdjointVal(i, k, rhs); + std::complex v = getAdjointVal(i, j, rhs); - blockBf[rir] = v.real(); - blockBf[rii] = v.imag(); + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); - rir += 1; - rii += 1; - } + rir += 1; + rii += 1; } + + rir += rows; } } -template +template EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; const_blas_data_mapper, Index, StorageOrder> lhs(_lhs, lhsStride); const Index vectorSize = quad_traits::vectorsize; const Index vectorDelta = vectorSize * depth; - Scalar* blockAf = (Scalar *)(blockA); + Scalar* blockAf = reinterpret_cast(blockA); Index rir = 0, rii, j = 0; for(; j + vectorSize <= rows; j+=vectorSize) @@ -203,7 +223,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc { for(Index k = 0; k < vectorSize; k++) { - std::complex v = getAdjointVal(j+k, i, lhs); + std::complex v = getAdjointVal(j+k, i, lhs); blockAf[rir + k] = v.real(); blockAf[rii + k] = v.imag(); @@ -224,7 +244,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc Index k = j; for(; k < rows; k++) { - std::complex v = getAdjointVal(k, i, lhs); + std::complex v = getAdjointVal(k, i, lhs); blockAf[rir] = v.real(); blockAf[rii] = v.imag(); @@ -236,7 +256,7 @@ EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* bloc } } -template +template EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { const Index depth = k2 + rows; @@ -260,24 +280,20 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } - if (j < cols) + for(; j < cols; j++) { for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - if(k <= i) - blockB[ri] = rhs(i, k); - else - blockB[ri] = rhs(k, i); - ri += 1; - } + if(j <= i) + blockB[ri] = rhs(i, j); + else + blockB[ri] = rhs(j, i); + ri += 1; } } } -template +template EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { const Index depth = cols; @@ -324,7 +340,7 @@ struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_complex_rhs_helper(blockB, _rhs, 
rhsStride, rows, cols, k2); } }; @@ -333,7 +349,7 @@ struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrde { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -344,7 +360,7 @@ struct symm_pack_rhs, Index, nr, StorageOrder> { void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -353,7 +369,7 @@ struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrd { void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -363,7 +379,7 @@ struct symm_pack_rhs { void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -372,7 +388,7 @@ struct symm_pack_lhs { void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -382,7 +398,7 @@ struct symm_pack_rhs { void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { - symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); } }; @@ -391,7 +407,7 @@ struct symm_pack_lhs { void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) { - symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); } }; @@ -406,26 +422,22 @@ struct symm_pack_lhs * and offset and behaves accordingly. **/ -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) -{ - const Index size = 16 / sizeof(Scalar); - pstore(to + (0 * size), block.packet[0]); - pstore(to + (1 * size), block.packet[1]); - pstore(to + (2 * size), block.packet[2]); - pstore(to + (3 * size), block.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); pstore(to + (1 * size), block.packet[1]); + if (N > 2) { + pstore(to + (2 * size), block.packet[2]); + } + if (N > 3) { + pstore(to + (3 * size), block.packet[3]); + } } // General template for lhs & rhs complex packing. -template +template struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -437,6 +449,7 @@ struct dhs_cpack { for(; j + vectorSize <= rows; j+=vectorSize) { + const DataMapper lhs2 = UseLhs ? 
lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j); Index i = 0; rii = rir + vectorDelta; @@ -447,9 +460,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs2, 0, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs2, i, 0); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -476,8 +489,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -490,28 +503,19 @@ struct dhs_cpack { if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs))) { if (UseLhs) { - cblock.packet[0] = lhs.template loadPacket(j + 0, i); - cblock.packet[1] = lhs.template loadPacket(j + 2, i); + cblock.packet[0] = lhs2.template loadPacket(0, i); + cblock.packet[1] = lhs2.template loadPacket(2, i); } else { - cblock.packet[0] = lhs.template loadPacket(i, j + 0); - cblock.packet[1] = lhs.template loadPacket(i, j + 2); + cblock.packet[0] = lhs2.template loadPacket(i, 0); + cblock.packet[1] = lhs2.template loadPacket(i, 2); } } else { - std::complex lhs0, lhs1; if (UseLhs) { - lhs0 = lhs(j + 0, i); - lhs1 = lhs(j + 1, i); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(j + 2, i); - lhs1 = lhs(j + 3, i); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs2(0, i), lhs2(1, i)); + cblock.packet[1] = pload2(lhs2(2, i), lhs2(3, i)); } else { - lhs0 = lhs(i, j + 0); - lhs1 = lhs(i, j + 1); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(i, j + 2); - lhs1 = lhs(i, j + 3); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs2(i, 0), lhs2(i, 1)); + cblock.packet[1] = pload2(lhs2(i, 2), lhs2(i, 3)); } } @@ -533,34 +537,51 @@ struct dhs_cpack { rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) rir += (offset*(rows - j - vectorSize)); - rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + if(PanelMode) rir -= (offset*(vectorSize - 1)); - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + const DataMapper lhs2 = lhs.getSubMapper(0, j); + rii = rir + ((PanelMode) ? stride : depth); + + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockAt[rir] = lhs2(i, 0).real(); + + if(Conjugate) + blockAt[rii] = -lhs2(i, 0).imag(); + else + blockAt[rii] = lhs2(i, 0).imag(); + + rir += 1; + rii += 1; + } + + rir += ((PanelMode) ? (2*stride - depth) : depth); + } + } else { + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockAt[rir] = lhs(k, i).real(); if(Conjugate) blockAt[rii] = -lhs(k, i).imag(); else blockAt[rii] = lhs(k, i).imag(); - } else { - blockAt[rir] = lhs(i, k).real(); - if(Conjugate) - blockAt[rii] = -lhs(i, k).imag(); - else - blockAt[rii] = lhs(i, k).imag(); + rir += 1; + rii += 1; } - - rir += 1; - rii += 1; } } } @@ -568,7 +589,7 @@ struct dhs_cpack { }; // General template for lhs & rhs packing. 
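
// --- Illustrative sketch (not from the patch) ---
// The storeBlock helper above replaces the old fixed 4-packet and 2-packet
// overloads with a single version templated on the packet count N; because N
// is a compile-time constant, the `if (N > 2)` / `if (N > 3)` guards cost
// nothing and the unused stores drop out. A scalar-flavoured sketch of the
// same pattern, with hypothetical names and memcpy standing in for pstore;
// the general lhs/rhs packing template follows after this sketch.
#include <cstring>

template <int N>
inline void store_block_sketch(float* to, const float (&block)[4][4]) {
  std::memcpy(to + 0, block[0], sizeof(block[0]));  // packets 0 and 1 always
  std::memcpy(to + 4, block[1], sizeof(block[1]));
  if (N > 2) std::memcpy(to + 8,  block[2], sizeof(block[2]));  // folded away when N == 2
  if (N > 3) std::memcpy(to + 12, block[3], sizeof(block[3]));
}
// --- End sketch ---
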
-template +template struct dhs_pack{ EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -577,6 +598,7 @@ struct dhs_pack{ for(; j + vectorSize <= rows; j+=vectorSize) { + const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j); Index i = 0; if(PanelMode) ri += vectorSize*offset; @@ -586,16 +608,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs2, 0, i); } else { - bload(block, lhs, i, j); + bload(block, lhs2, i, 0); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -604,22 +626,22 @@ struct dhs_pack{ if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { if (UseLhs) { - blockA[ri+0] = lhs(j+0, i); - blockA[ri+1] = lhs(j+1, i); - blockA[ri+2] = lhs(j+2, i); - blockA[ri+3] = lhs(j+3, i); + blockA[ri+0] = lhs2(0, i); + blockA[ri+1] = lhs2(1, i); + blockA[ri+2] = lhs2(2, i); + blockA[ri+3] = lhs2(3, i); } else { - blockA[ri+0] = lhs(i, j+0); - blockA[ri+1] = lhs(i, j+1); - blockA[ri+2] = lhs(i, j+2); - blockA[ri+3] = lhs(i, j+3); + blockA[ri+0] = lhs2(i, 0); + blockA[ri+1] = lhs2(i, 1); + blockA[ri+2] = lhs2(i, 2); + blockA[ri+3] = lhs2(i, 3); } } else { Packet lhsV; if (UseLhs) { - lhsV = lhs.template loadPacket(j, i); + lhsV = lhs2.template loadPacket(0, i); } else { - lhsV = lhs.template loadPacket(i, j); + lhsV = lhs2.template loadPacket(i, 0); } pstore(blockA + ri, lhsV); } @@ -630,30 +652,43 @@ struct dhs_pack{ if(PanelMode) ri += vectorSize*(stride - offset - depth); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) ri += offset*(rows - j); + if(PanelMode) ri += offset; - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + const DataMapper lhs2 = lhs.getSubMapper(0, j); + for(Index i = 0; i < depth; i++) { - if (UseLhs) { - blockA[ri] = lhs(k, i); - } else { - blockA[ri] = lhs(i, k); - } + blockA[ri] = lhs2(i, 0); ri += 1; } + + if(PanelMode) ri += stride - depth; + } + } else { + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } } } } }; // General template for lhs packing, float64 specialization. 
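
// --- Illustrative sketch (not from the patch) ---
// The layout all of these dhs_pack variants produce: the panel is rewritten
// so that `vectorsize` consecutive rows are interleaved per depth step,
// giving the GEMM microkernel unit-stride packet loads. A plain-C++ sketch
// for a column-major lhs with vectorsize == 4, ignoring panel mode, stride
// and offset (function and parameter names hypothetical); the float64 lhs
// specialization follows.
inline void pack_lhs_sketch(float* blockA, const float* lhs, int lhsStride,
                            int depth, int rows) {
  int ri = 0, j = 0;
  for (; j + 4 <= rows; j += 4)    // full 4-row panels
    for (int i = 0; i < depth; i++)
      for (int k = 0; k < 4; k++)
        blockA[ri++] = lhs[(j + k) + i * lhsStride];
  if (j < rows)                    // leftover rows, still interleaved per depth step
    for (int i = 0; i < depth; i++)
      for (int k = j; k < rows; k++)
        blockA[ri++] = lhs[k + i * lhsStride];
}
// --- End sketch ---
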
-template -struct dhs_pack +template +struct dhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -662,6 +697,7 @@ struct dhs_pack block; if(StorageOrder == RowMajor) { - block.packet[0] = lhs.template loadPacket(j + 0, i); - block.packet[1] = lhs.template loadPacket(j + 1, i); + block.packet[0] = lhs2.template loadPacket(0, i); + block.packet[1] = lhs2.template loadPacket(1, i); ptranspose(block); } else { - block.packet[0] = lhs.template loadPacket(j, i + 0); - block.packet[1] = lhs.template loadPacket(j, i + 1); + block.packet[0] = lhs2.template loadPacket(0, i + 0); + block.packet[1] = lhs2.template loadPacket(0, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -688,10 +724,10 @@ struct dhs_pack(j, i); + Packet2d lhsV = lhs2.template loadPacket(0, i); pstore(blockA + ri, lhsV); } @@ -719,8 +755,8 @@ struct dhs_pack -struct dhs_pack +template +struct dhs_pack { EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { @@ -729,6 +765,7 @@ struct dhs_pack block1, block2; - block1.packet[0] = rhs.template loadPacket(i, j + 0); - block1.packet[1] = rhs.template loadPacket(i, j + 1); - block2.packet[0] = rhs.template loadPacket(i, j + 2); - block2.packet[1] = rhs.template loadPacket(i, j + 3); + block1.packet[0] = rhs2.template loadPacket(i, 0); + block1.packet[1] = rhs2.template loadPacket(i, 1); + block2.packet[0] = rhs2.template loadPacket(i, 2); + block2.packet[1] = rhs2.template loadPacket(i, 3); ptranspose(block1); ptranspose(block2); @@ -752,12 +789,12 @@ struct dhs_pack(blockB + ri + 4, block1.packet[1]); pstore(blockB + ri + 6, block2.packet[1]); } else { - block.packet[0] = rhs.template loadPacket(i + 0, j + 0); //[a1 a2] - block.packet[1] = rhs.template loadPacket(i + 0, j + 2); //[a3 a4] - block.packet[2] = rhs.template loadPacket(i + 1, j + 0); //[b1 b2] - block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] + block.packet[0] = rhs2.template loadPacket(i + 0, 0); //[a1 a2] + block.packet[1] = rhs2.template loadPacket(i + 0, 2); //[a3 a4] + block.packet[2] = rhs2.template loadPacket(i + 1, 0); //[b1 b2] + block.packet[3] = rhs2.template loadPacket(i + 1, 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 4*vectorSize; @@ -766,20 +803,20 @@ struct dhs_pack(i, j); + Packet2d rhsV = rhs2.template loadPacket(i, 0); pstore(blockB + ri, rhsV); ri += vectorSize; - rhsV = rhs.template loadPacket(i, j + 2); + rhsV = rhs2.template loadPacket(i, 2); pstore(blockB + ri, rhsV); } ri += vectorSize; @@ -788,26 +825,25 @@ struct dhs_pack -struct dhs_cpack +template +struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { @@ -819,6 +855,7 @@ struct dhs_cpack(j, i + 0); //[a1 a1i] - cblock.packet[1] = lhs.template loadPacket(j, i + 1); //[b1 b1i] + cblock.packet[0] = lhs2.template loadPacket(0, i + 0); //[a1 a1i] + cblock.packet[1] = lhs2.template loadPacket(0, i + 1); //[b1 b1i] - cblock.packet[2] = lhs.template loadPacket(j + 1, i + 0); //[a2 a2i] - cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i] + cblock.packet[2] = lhs2.template loadPacket(1, i + 0); //[a2 a2i] + cblock.packet[3] = lhs2.template loadPacket(1, i + 1); //[b2 b2i] - blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, 
p16uc_GETREAL64); //[a1 a2] - blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2] + blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2] - blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64); - blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64); + blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v); + blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v); } else { - cblock.packet[0] = lhs.template loadPacket(j + 0, i); //[a1 a1i] - cblock.packet[1] = lhs.template loadPacket(j + 1, i); //[a2 a2i] + cblock.packet[0] = lhs2.template loadPacket(0, i); //[a1 a1i] + cblock.packet[1] = lhs2.template loadPacket(1, i); //[a2 a2i] - cblock.packet[2] = lhs.template loadPacket(j + 0, i + 1); //[b1 b1i] - cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i + cblock.packet[2] = lhs2.template loadPacket(0, i + 1); //[b1 b1i] + cblock.packet[3] = lhs2.template loadPacket(1, i + 1); //[b2 b2i - blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2] - blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2] + blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2] - blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); - blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); + blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v); + blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v); } if(Conjugate) @@ -861,8 +898,8 @@ struct dhs_cpack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -872,11 +909,11 @@ struct dhs_cpack blockr, blocki; PacketBlock cblock; - cblock.packet[0] = lhs.template loadPacket(j + 0, i); - cblock.packet[1] = lhs.template loadPacket(j + 1, i); + cblock.packet[0] = lhs2.template loadPacket(0, i); + cblock.packet[1] = lhs2.template loadPacket(1, i); - blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); - blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); + blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v); if(Conjugate) { @@ -919,8 +956,8 @@ struct dhs_cpack -struct dhs_cpack +template +struct dhs_cpack { EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { @@ -932,6 +969,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs2, i, 0); - blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); - blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); + blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); + blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); - blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); - blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); 
+ blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v); + blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v); if(Conjugate) { @@ -955,8 +993,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -965,27 +1003,27 @@ struct dhs_cpack -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) { if(NegativeAccumulate) { - acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); + for (int M = 0; M < N; M++) { + acc->packet[M] = vec_nmsub(lhsV, rhsV[M], acc->packet[M]); + } } else { - acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); - } -} - -template -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) -{ - if(NegativeAccumulate) - { - acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); - } else { - acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + for (int M = 0; M < N; M++) { + acc->packet[M] = vec_madd(lhsV, rhsV[M], acc->packet[M]); + } } } @@ -1028,559 +1053,628 @@ EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con { Packet lhsV = pload(lhs); - pger_common(acc, lhsV, rhsV); -} - -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) -{ -#ifdef _ARCH_PWR9 - lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); -#else - Index i = 0; - do { - lhsV[i] = lhs[i]; - } while (++i < remaining_rows); -#endif -} - -template -EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) -{ - Packet lhsV; - loadPacketRemaining(lhs, lhsV, remaining_rows); - - pger_common(acc, lhsV, rhsV); + pger_common(acc, lhsV, rhsV); } // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
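
// --- Illustrative sketch (not from the patch) ---
// The decoupled complex rank-1 update described in the comment above,
// written out for a single lane: with lhs = lr + i*li and rhs = rr + i*ri,
//   accReal += lr*rr - li*ri   (the subtraction is why vec_nmsub appears)
//   accImag += lr*ri + li*rr
// and conjugation merely flips the sign of li or ri. Scalar sketch with
// hypothetical names:
inline void pgerc_lane_sketch(float& accReal, float& accImag,
                              float lr, float li, float rr, float ri,
                              bool conjLhs, bool conjRhs) {
  if (conjLhs) li = -li;
  if (conjRhs) ri = -ri;
  accReal += lr * rr - li * ri;  // pger_common<..., false>, then <..., true>
  accImag += lr * ri + li * rr;  // two plain multiply-add chains
}
// --- End sketch ---
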
template -EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { - pger_common(accReal, lhsV, rhsV); + pger_common(accReal, lhsV, rhsV); if(LhsIsReal) { - pger_common(accImag, lhsV, rhsVi); + pger_common(accImag, lhsV, rhsVi); EIGEN_UNUSED_VARIABLE(lhsVi); } else { if (!RhsIsReal) { - pger_common(accReal, lhsVi, rhsVi); - pger_common(accImag, lhsV, rhsVi); + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); } else { EIGEN_UNUSED_VARIABLE(rhsVi); } - pger_common(accImag, lhsVi, rhsV); + pger_common(accImag, lhsVi, rhsV); } } template EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { - Packet lhsV = ploadLhs(lhs_ptr); + Packet lhsV = ploadLhs(lhs_ptr); Packet lhsVi; - if(!LhsIsReal) lhsVi = ploadLhs(lhs_ptr_imag); + if(!LhsIsReal) lhsVi = ploadLhs(lhs_ptr_imag); else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) -{ -#ifdef _ARCH_PWR9 - lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); - if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar)); - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); -#else - Index i = 0; - do { - lhsV[i] = lhs_ptr[i]; - if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i]; - } while (++i < remaining_rows); - if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); -#endif -} - -template -EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) -{ - Packet lhsV, lhsVi; - loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); - - pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); -} - -template -EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) +template +EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs) { return ploadu(lhs); } // Zero the accumulator on PacketBlock. -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { - acc.packet[0] = pset1((Scalar)0); - acc.packet[1] = pset1((Scalar)0); - acc.packet[2] = pset1((Scalar)0); - acc.packet[3] = pset1((Scalar)0); + for (int M = 0; M < N; M++) { + acc.packet[M] = pset1((__UNPACK_TYPE__(Packet))0); + } } -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { - acc.packet[0] = pset1((Scalar)0); + for (int M = 0; M < N; M++) { + acc.packet[M] = vec_mul(accZ.packet[M], pAlpha); + } } -// Scale the PacketBlock vectors by alpha. 
-template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +template +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) { - acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); - acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); - acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); - acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); -} - -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmul(accZ.packet[0], pAlpha); - acc.packet[1] = pmul(accZ.packet[1], pAlpha); - acc.packet[2] = pmul(accZ.packet[2], pAlpha); - acc.packet[3] = pmul(accZ.packet[3], pAlpha); -} - -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmul(accZ.packet[0], pAlpha); + for (int M = 0; M < N; M++) { + acc.packet[M] = pand(acc.packet[M], pMask); + } } // Complex version of PacketBlock scaling. -template -EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) { - bscalec_common(cReal, aReal, bReal); + if (mask && (sizeof(__UNPACK_TYPE__(Packet)) == sizeof(float))) { + band(aReal, pMask); + band(aImag, pMask); + } else { + EIGEN_UNUSED_VARIABLE(pMask); + } - bscalec_common(cImag, aImag, bReal); + bscalec_common(cReal, aReal, bReal); - pger_common(&cReal, bImag, aImag.packet); + bscalec_common(cImag, aImag, bReal); - pger_common(&cImag, bImag, aReal.packet); -} + pger_common(&cReal, bImag, aImag.packet); -template -EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) -{ - acc.packet[0] = pand(acc.packet[0], pMask); - acc.packet[1] = pand(acc.packet[1], pMask); - acc.packet[2] = pand(acc.packet[2], pMask); - acc.packet[3] = pand(acc.packet[3], pMask); -} - -template -EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) -{ - band(aReal, pMask); - band(aImag, pMask); - - bscalec(aReal, aImag, bReal, bImag, cReal, cImag); + pger_common(&cImag, bImag, aReal.packet); } // Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
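
// --- Illustrative sketch (not from the patch) ---
// bload (below) pulls the current tile of the result matrix into registers
// before it is scaled by alpha: one packet per row in RowMajor order or per
// column in ColMajor order, plus a second set of packets at row + accCols
// when the decoupled complex halves are needed. A plain-array sketch of the
// ColMajor real case with packet width 4 (names hypothetical):
template <int N>
inline void bload_sketch(float acc[][4], const float* res, int resStride,
                         int row, int col) {
  for (int M = 0; M < N; M++)        // one column packet per accumulator
    for (int lane = 0; lane < 4; lane++)
      acc[M][lane] = res[(row + lane) + (col + M) * resStride];
}
// --- End sketch ---
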
-template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +// +// full = operate (load) on the entire PacketBlock or only half +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + for (int M = 0; M < N; M++) { + acc.packet[M] = res.template loadPacket(row + M, col); + } + if (Complex) { + for (int M = 0; M < N; M++) { + acc.packet[M+N] = res.template loadPacket(row + M, col + accCols); + } + } } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - } -} - -// An overload of bload when you have a PacketBLock with 8 vectors. -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); - acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); - acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); - acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); - } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); - acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); - acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); - acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); - } -} - -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); -} - -const static Packet4i mask41 = { -1, 0, 0, 0 }; -const static Packet4i mask42 = { -1, -1, 0, 0 }; -const static Packet4i mask43 = { -1, -1, -1, 0 }; - -const static Packet2l mask21 = { -1, 0 }; - -template -EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) -{ - if (remaining_rows == 0) { - return pset1(float(0.0)); // Not used - } else { - switch (remaining_rows) { - case 1: return Packet(mask41); - case 2: return Packet(mask42); - default: return Packet(mask43); + for (int M = 0; M < N; M++) { + acc.packet[M] = res.template loadPacket(row, col + M); + } + if (Complex && full) { + for (int M = 0; M < N; M++) { + acc.packet[M+N] = res.template loadPacket(row + accCols, col + M); + } } } } -template<> -EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) +template +EIGEN_ALWAYS_INLINE void 
bstore(PacketBlock& acc, const DataMapper& res, Index row) { - if (remaining_rows == 0) { - return pset1(double(0.0)); // Not used - } else { - return Packet2d(mask21); + for (int M = 0; M < N; M++) { + res.template storePacket(row, M, acc.packet[M]); } } -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +#ifdef USE_PARTIAL_PACKETS +template +EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements) { - band(accZ, pMask); - - bscale(acc, accZ, pAlpha); + for (Index M = 0; M < N; M++) { + acc.packet[M] = res.template loadPacketPartial(row, M, elements); + } + if (Complex && full) { + for (Index M = 0; M < N; M++) { + acc.packet[M+N] = res.template loadPacketPartial(row + accCols, M, elements); + } + } } -template -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +template +EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements) { - pbroadcast4(a, a0, a1, a2, a3); + for (Index M = 0; M < N; M++) { + res.template storePacketPartial(row, M, acc.packet[M], elements); + } +} +#endif + +#ifdef _ARCH_PWR10 +#define USE_P10_AND_PVIPR2_0 (EIGEN_COMP_LLVM || (__GNUC__ >= 11)) +#else +#define USE_P10_AND_PVIPR2_0 0 +#endif + +#if !USE_P10_AND_PVIPR2_0 +const static Packet4i mask4[4] = { { 0, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, -1, 0, 0 }, { -1, -1, -1, 0 } }; +#endif + +template +EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) +{ +#if USE_P10_AND_PVIPR2_0 +#ifdef _BIG_ENDIAN + return Packet(vec_reve(vec_genwm((1 << remaining_rows) - 1))); +#else + return Packet(vec_genwm((1 << remaining_rows) - 1)); +#endif +#else + return Packet(mask4[remaining_rows]); +#endif } template<> -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE Packet2d bmask(const Index remaining_rows) { - a1 = pload(a); - a3 = pload(a + 2); +#if USE_P10_AND_PVIPR2_0 + Packet2d mask2 = Packet2d(vec_gendm(remaining_rows)); +#ifdef _BIG_ENDIAN + return preverse(mask2); +#else + return mask2; +#endif +#else + Packet2l ret = { -remaining_rows, 0 }; + return Packet2d(ret); +#endif +} + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + for (int M = 0; M < N; M++) { + acc.packet[M] = pmadd(pAlpha, accZ.packet[M], acc.packet[M]); + } +} + +// Scale the PacketBlock vectors by alpha. 
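
// --- Illustrative sketch (not from the patch) ---
// bmask (above) builds a packet whose first remaining_rows lanes are all-ones
// and whose tail lanes are zero; band() ANDs an accumulator with that mask so
// lanes past the edge of the matrix contribute nothing when the masked bscale
// below adds alpha * accZ into the result. Lane-wise sketch for a 4-wide
// packet (names hypothetical):
inline void masked_scale_sketch(float acc[4], const float accZ[4],
                                float alpha, int remaining_rows) {
  for (int lane = 0; lane < 4; lane++) {
    // AND with an all-ones lane keeps the value; AND with zero drops it.
    float masked = (lane < remaining_rows) ? accZ[lane] : 0.0f;
    acc[lane] += alpha * masked;
  }
}
// --- End sketch ---
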
+template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +{ + if (mask) { + band(accZ, pMask); + } else { + EIGEN_UNUSED_VARIABLE(pMask); + } + + bscale(acc, accZ, pAlpha); +} + +template +EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) *ap0, + const __UNPACK_TYPE__(Packet) *ap1, const __UNPACK_TYPE__(Packet) *ap2, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pset1(ap0[0]); + if (N == 4) { + a1 = pset1(ap0[1]); + a2 = pset1(ap0[2]); + a3 = pset1(ap0[3]); + EIGEN_UNUSED_VARIABLE(ap1); + EIGEN_UNUSED_VARIABLE(ap2); + } else { + if (N > 1) { + a1 = pset1(ap1[0]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + EIGEN_UNUSED_VARIABLE(ap1); + } + if (N > 2) { + a2 = pset1(ap2[0]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + EIGEN_UNUSED_VARIABLE(ap2); + } + } +} + +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *ap0, const float *, const float *, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4(ap0, a0, a1, a2, a3); +} + +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *ap0, const float *ap1, const float *ap2, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcastN(ap0, ap1, ap2, a0, a1, a2, a3); +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcastN(const double* ap0, const double *, + const double *, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(ap0); + a3 = pload(ap0 + 2); a0 = vec_splat(a1, 0); a1 = vec_splat(a1, 1); a2 = vec_splat(a3, 0); a3 = vec_splat(a3, 1); } +// Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + for (int M = 0; M < N; M++) { + acc1.packet[M].v = vec_mergeh(taccReal.packet[M], taccImag.packet[M]); + } + + if (full) { + for (int M = 0; M < N; M++) { + acc2.packet[M].v = vec_mergel(taccReal.packet[M], taccImag.packet[M]); + } + } +} + +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +{ + bcouple_common(taccReal, taccImag, acc1, acc2); + + for (int M = 0; M < N; M++) { + acc1.packet[M] = padd(tRes.packet[M], acc1.packet[M]); + } + + if (full) { + for (int M = 0; M < N; M++) { + acc2.packet[M] = padd(tRes.packet[M+N], acc2.packet[M]); + } + } +} + // PEEL loop factor.
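
// --- Illustrative sketch (not from the patch) ---
// The PEEL factor defined below replicates the depth-loop body several times
// between prefetches so that many independent multiply-adds are in flight at
// once; whatever depth is left over runs in a plain trailing loop. The shape
// of the idiom, scalarized (names hypothetical):
inline float dot_peeled_sketch(const float* a, const float* b, int depth) {
  const int kPeel = 7;               // mirrors #define PEEL 7
  float acc = 0.0f;
  int k = 0;
  for (; k + kPeel <= depth; k += kPeel)
    for (int l = 0; l < kPeel; l++)  // constant trip count: fully unrollable
      acc += a[k + l] * b[k + l];
  for (; k < depth; k++)             // remainder
    acc += a[k] * b[k];
  return acc;
}
// --- End sketch ---
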
#define PEEL 7 - -template -EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows, - Index remaining_cols) -{ - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; -} - -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha) -{ - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero; - - bsetzero(accZero); - - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } - for(; k < depth; k++) - { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; - } - - accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col) += accZero.packet[0][i]; - } -} - -template -EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows) -{ - Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += accRows; -} - -template -EIGEN_STRONG_INLINE void gemm_extra_row( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index rows, - Index cols, - Index remaining_rows, - const Packet& pAlpha, - const Packet& pMask) -{ - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero, acc; - - bsetzero(accZero); - - Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows);
-  Index k = 0;
-  for(; k + PEEL <= remaining_depth; k+= PEEL)
-  {
-    EIGEN_POWER_PREFETCH(rhs_ptr);
-    EIGEN_POWER_PREFETCH(lhs_ptr);
-    for (int l = 0; l < PEEL; l++) {
-      MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows);
-    }
-  }
-  for(; k < remaining_depth; k++)
-  {
-    MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows);
-  }
-
-  if ((remaining_depth == depth) && (rows >= accCols))
-  {
-    for(Index j = 0; j < 4; j++) {
-      acc.packet[j] = res.template loadPacket(row, col + j);
-    }
-    bscale(acc, accZero, pAlpha, pMask);
-    res.template storePacketBlock(row, col, acc);
-  } else {
-    for(; k < depth; k++)
-    {
-      Packet rhsV[4];
-      pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
-      pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
-      lhs_ptr += remaining_rows;
-      rhs_ptr += accRows;
-    }
-
-    for(Index j = 0; j < 4; j++) {
-      accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
-    }
-    for(Index j = 0; j < 4; j++) {
-      for(Index i = 0; i < remaining_rows; i++) {
-        res(row + i, col + j) += accZero.packet[j][i];
-      }
-    }
-  }
-}
+#define PEEL_ROW 7

#define MICRO_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

-#define MICRO_UNROLL_WORK(func, func2, peel) \
-  MICRO_UNROLL(func2); \
-  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
-  func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+#define MICRO_NORMAL_ROWS \
+  accRows == quad_traits::rows || accRows == 1

-#define MICRO_LOAD_ONE(iter) \
-  if (unroll_factor > iter) { \
-    lhsV##iter = ploadLhs(lhs_ptr##iter); \
-    lhs_ptr##iter += accCols; \
+#define MICRO_NEW_ROWS ((MICRO_NORMAL_ROWS) ? accRows : 1)
+
+#define MICRO_RHS(ptr, N) rhs_##ptr##N
+
+#define MICRO_ZERO_PEEL(peel) \
+  if ((PEEL_ROW > peel) && (peel != 0)) { \
+    bsetzero(accZero##peel); \
  } else { \
-    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(accZero##peel); \
  }

+#define MICRO_ADD(ptr, N) \
+  if (MICRO_NORMAL_ROWS) { \
+    MICRO_RHS(ptr,0) += (accRows * N); \
+  } else { \
+    MICRO_RHS(ptr,0) += N; \
+    MICRO_RHS(ptr,1) += N; \
+    if (accRows == 3) { \
+      MICRO_RHS(ptr,2) += N; \
+    } \
+  }
+
+#define MICRO_ADD_ROWS(N) MICRO_ADD(ptr, N)
+
+#define MICRO_BROADCAST1(peel, ptr, rhsV, real) \
+  if (MICRO_NORMAL_ROWS) { \
+    pbroadcastN(MICRO_RHS(ptr,0) + (accRows * peel), MICRO_RHS(ptr,0), MICRO_RHS(ptr,0), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+  } else { \
+    pbroadcastN(MICRO_RHS(ptr,0) + peel, MICRO_RHS(ptr,1) + peel, MICRO_RHS(ptr,2) + peel, rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+  }
+
+#define MICRO_BROADCAST(peel) MICRO_BROADCAST1(peel, ptr, rhsV, true)
+
+#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real) \
+  pbroadcastN(MICRO_RHS(ptr,0), MICRO_RHS(ptr,1), MICRO_RHS(ptr,2), rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+
+#define MICRO_BROADCAST_EXTRA \
+  Packet rhsV[4]; \
+  MICRO_BROADCAST_EXTRA1(ptr, rhsV, true) \
+  MICRO_ADD_ROWS(1)
+
+#define MICRO_SRC2(ptr, N, M) \
+  if (MICRO_NORMAL_ROWS) { \
+    EIGEN_UNUSED_VARIABLE(strideB); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,1)); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
+  } else { \
+    MICRO_RHS(ptr,1) = rhs_base + N + M; \
+    if (accRows == 3) { \
+      MICRO_RHS(ptr,2) = rhs_base + N*2 + M; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
+    } \
+  }
+
+#define MICRO_SRC2_PTR MICRO_SRC2(ptr, strideB, 0)
+
+#define MICRO_ZERO_PEEL_ROW MICRO_UNROLL(MICRO_ZERO_PEEL)
+
+#define MICRO_WORK_PEEL(peel) \
+  if (PEEL_ROW > peel) { \
+    MICRO_BROADCAST(peel) \
+    pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_WORK_PEEL_ROW \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
+  MICRO_UNROLL(MICRO_WORK_PEEL) \
+  lhs_ptr += (remaining_rows * PEEL_ROW); \
+  MICRO_ADD_ROWS(PEEL_ROW)
+
+#define MICRO_ADD_PEEL(peel, sum) \
+  if (PEEL_ROW > peel) { \
+    for (Index i = 0; i < accRows; i++) { \
+      accZero##sum.packet[i] += accZero##peel.packet[i]; \
+    } \
+  }
+
+#define MICRO_ADD_PEEL_ROW \
+  MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
+  MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
+
+#define MICRO_PREFETCHN1(ptr, N) \
+  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,0)); \
+  if (N == 2 || N == 3) { \
+    EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,1)); \
+    if (N == 3) { \
+      EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,2)); \
+    } \
+  }
+
+#define MICRO_PREFETCHN(N) MICRO_PREFETCHN1(ptr, N)
+
+#define MICRO_COMPLEX_PREFETCHN(N) \
+  MICRO_PREFETCHN1(ptr_real, N); \
+  if(!RhsIsReal) { \
+    MICRO_PREFETCHN1(ptr_imag, N); \
+  }
+
+template
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr0,
+  const Scalar* &rhs_ptr1,
+  const Scalar* &rhs_ptr2,
+  PacketBlock &accZero)
+{
+  MICRO_BROADCAST_EXTRA
+  pger(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+}
+
+template
+EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
+
+  MICRO_SRC2_PTR
+  bsetzero(accZero0);
+
+  Index remaining_depth = depth & -quad_traits::rows;
+  Index k = 0;
+  if (remaining_depth >= PEEL_ROW) {
+    MICRO_ZERO_PEEL_ROW
+    do
+    {
+      MICRO_PREFETCHN(accRows)
+      EIGEN_POWER_PREFETCH(lhs_ptr);
+      MICRO_WORK_PEEL_ROW
+    } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
+    MICRO_ADD_PEEL_ROW
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
+  }
+
+#ifdef USE_PARTIAL_PACKETS
+  EIGEN_UNUSED_VARIABLE(rows);
+  EIGEN_UNUSED_VARIABLE(pMask);
+  bload_partial(acc, res, row, remaining_rows);
+  bscale(acc, accZero0, pAlpha);
+  bstore_partial(acc, res, row, remaining_rows);
+#else
+  bload(acc, res, row, 0);
+  if ((accRows == 1) || (rows >= accCols))
+  {
+    bscale(acc, accZero0, pAlpha, pMask);
+    bstore(acc, res, row);
+  } else {
+    bscale(acc, accZero0, pAlpha, pMask);
+    for(Index j = 0; j < accRows; j++) {
+      for(Index i = 0; i < remaining_rows; i++) {
+        res(row + i, j) = acc.packet[j][i];
+      }
+    }
+  }
+#endif
+}
+
+#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
+  switch(value) { \
+    default: \
+      MICRO_EXTRA_UNROLL(1) \
+      break; \
+    case 2: \
+      if (is_col || (sizeof(Scalar) == sizeof(float))) { \
+        MICRO_EXTRA_UNROLL(2) \
+      } \
+      break; \
+    case 3: \
+      if (is_col || (sizeof(Scalar) == sizeof(float))) { \
+        MICRO_EXTRA_UNROLL(3) \
+      } \
+      break; \
+  }
+
+#define MICRO_EXTRA_ROWS(N) \
+  gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
+
+template
+EIGEN_ALWAYS_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index rows,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  MICRO_EXTRA(MICRO_EXTRA_ROWS, remaining_rows, false)
+}
+
+#define MICRO_UNROLL_WORK(func, func2, peel) \
+  MICRO_UNROLL(func2); \
+  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
+  func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+
#define MICRO_WORK_ONE(iter, peel) \
  if (unroll_factor > iter) { \
-    pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \
+    pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \
  }

#define MICRO_TYPE_PEEL4(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
-    MICRO_UNROLL_WORK(func, func2, peel) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
-  }
-
-#define MICRO_TYPE_PEEL1(func, func2, peel) \
-  if (PEEL > peel) { \
-    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \
+    MICRO_BROADCAST(peel) \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
-  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
-  func(func1,func2,0); func(func1,func2,1); \
-  func(func1,func2,2); func(func1,func2,3); \
-  func(func1,func2,4); func(func1,func2,5); \
-  func(func1,func2,6); func(func1,func2,7); \
-  func(func1,func2,8); func(func1,func2,9);
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
+  func(func1,func2,0) func(func1,func2,1) \
+  func(func1,func2,2) func(func1,func2,3) \
+  func(func1,func2,4) func(func1,func2,5) \
+  func(func1,func2,6) func(func1,func2,7)

#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M]; \
-  func(func1,func2,0);
+  func(func1,func2,0)

-#define MICRO_ONE_PEEL4 \
-  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
-  rhs_ptr += (accRows * PEEL);
+#define MICRO_UNROLL_TYPE(MICRO_TYPE, size) \
+  MICRO_TYPE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE) \
+  MICRO_ADD_ROWS(size)

-#define MICRO_ONE4 \
-  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
-  rhs_ptr += accRows;
+#define MICRO_ONE_PEEL4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_PEEL, PEEL)

-#define MICRO_ONE_PEEL1 \
-  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
-  rhs_ptr += (remaining_cols * PEEL);
-
-#define MICRO_ONE1 \
-  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
-  rhs_ptr += remaining_cols;
+#define MICRO_ONE4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_ONE, 1)

#define MICRO_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
-    bsetzero(accZero##iter); \
+    bsetzero(accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }

#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)

-#define MICRO_SRC_PTR_ONE(iter) \
-  if (unroll_factor > iter) { \
-    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
-  }
-
#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)

-#define MICRO_PREFETCH_ONE(iter) \
-  if (unroll_factor > iter) { \
-    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
-  }
-
#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)

+#ifdef USE_PARTIAL_PACKETS
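+// NOTE: USE_PARTIAL_PACKETS appears to select between two tail-handling strategies for
+// result tiles narrower than a full accCols block. When it is defined, the store macros
+// below use bload_partial/bstore_partial with an explicit element count (accCols2),
+// presumably mapping onto load/store-with-length style instructions; when it is not
+// defined, the kernel performs full-width packet accesses and guards the out-of-range
+// lanes with the pMask bitmask, roughly:
+//   bload(acc, res, row, 0);               // load the full C tile
+//   bscale(acc, accZero0, pAlpha, pMask);  // fold alpha*accZero into acc; pMask guards tail lanes
+//   bstore(acc, res, row);                 // store the full tile back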
#define MICRO_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
-    acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \
-    acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \
-    acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \
-    acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \
-    bscale(acc, accZero##iter, pAlpha); \
-    res.template storePacketBlock(row + iter*accCols, col, acc); \
+    if (MICRO_NORMAL_PARTIAL(iter)) { \
+      bload(acc, res, row + iter*accCols, 0); \
+      bscale(acc, accZero##iter, pAlpha); \
+      bstore(acc, res, row + iter*accCols); \
+    } else { \
+      bload_partial(acc, res, row + iter*accCols, accCols2); \
+      bscale(acc, accZero##iter, pAlpha); \
+      bstore_partial(acc, res, row + iter*accCols, accCols2); \
+    } \
  }
+#else
+#define MICRO_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload(acc, res, row + iter*accCols, 0); \
+    bscale(acc, accZero##iter, pAlpha, pMask); \
+    bstore(acc, res, row + iter*accCols); \
+  }
+#endif

#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)

-#define MICRO_COL_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \
-    bscale(acc, accZero##iter, pAlpha); \
-    res.template storePacketBlock(row + iter*accCols, col, acc); \
-  }
-
-#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
-
-template
-EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
+#ifdef USE_PARTIAL_PACKETS
+template
+#else
+template
+#endif
+EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
+  Index strideB,
  Index& row,
-  Index col,
-  const Packet& pAlpha)
+  const Packet& pAlpha,
+#ifdef USE_PARTIAL_PACKETS
+  Index accCols2
+#else
+  const Packet& pMask
+#endif
+  )
{
-  const Scalar* rhs_ptr = rhs_base;
-  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
-  PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
-  PacketBlock acc;
+  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
+  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+  PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock acc;
+
+  MICRO_SRC2_PTR
  MICRO_SRC_PTR
  MICRO_DST_PTR

  Index k = 0;
  for(; k + PEEL <= depth; k+= PEEL)
  {
-    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_PREFETCHN(accRows)
    MICRO_PREFETCH
    MICRO_ONE_PEEL4
  }
@@ -1590,197 +1684,139 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
  }
  MICRO_STORE

-  row += unroll_factor*accCols;
+  MICRO_UPDATE
}

-template
-EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_UNROLL_ITER2(N, M) \
+  gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
+  if (M) return;
+#else
+#define MICRO_UNROLL_ITER2(N, M) \
+  gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
+  if (M) return;
+#endif
+
+template
+EIGEN_ALWAYS_INLINE void gemm_cols(
  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
+  const Scalar* blockA,
+  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
-  Index& row,
+  Index strideB,
+  Index offsetB,
  Index col,
-  Index remaining_cols,
-  const Packet& pAlpha)
-{
-  const Scalar* rhs_ptr = rhs_base;
-  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
-  PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
-  PacketBlock acc;
-
-  MICRO_SRC_PTR
-  MICRO_DST_PTR
-
-  Index k = 0;
-  for(; k + PEEL <= depth; k+= PEEL)
-  {
-    EIGEN_POWER_PREFETCH(rhs_ptr);
-    MICRO_PREFETCH
-    MICRO_ONE_PEEL1
-  }
-  for(; k < depth; k++)
-  {
-    MICRO_ONE1
-  }
-  MICRO_COL_STORE
-
-  row += unroll_factor*accCols;
-}
-
-template
-EIGEN_STRONG_INLINE void gemm_unrolled_col(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index& row,
  Index rows,
-  Index col,
-  Index remaining_cols,
-  const Packet& pAlpha)
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
{
-#define MAX_UNROLL 6
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + col*strideB + MICRO_NEW_ROWS*offsetB;
+  const Scalar* lhs_base = blockA + accCols*offsetA;
+  Index row = 0;
+
+#define MAX_UNROLL 7
  while(row + MAX_UNROLL*accCols <= rows) {
-    gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+    MICRO_UNROLL_ITER2(MAX_UNROLL, 0);
  }
  switch( (rows-row)/accCols ) {
#if MAX_UNROLL > 7
  case 7:
-    gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 7)
    break;
#endif
#if MAX_UNROLL > 6
  case 6:
-    gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 6)
    break;
#endif
#if MAX_UNROLL > 5
-  case 5:
-    gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+  case 5:
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 5)
    break;
#endif
#if MAX_UNROLL > 4
-  case 4:
-    gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+  case 4:
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 4)
    break;
#endif
#if MAX_UNROLL > 3
-  case 3:
-    gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
-    break;
+  case 3:
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 3)
+    break;
#endif
#if MAX_UNROLL > 2
-  case 2:
-    gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
-    break;
+  case 2:
+    MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 2)
+    break;
#endif
#if MAX_UNROLL > 1
-  case 1:
-    gemm_unrolled_col_iteration<1, 
Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 1: + MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 1) + break; #endif - default: - break; + default: + break; } #undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); + } +} + +#define MICRO_EXTRA_COLS(N) \ + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); + +template +EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + MICRO_EXTRA(MICRO_EXTRA_COLS, cols-col, true) } /**************** * GEMM kernels * * **************/ -template +template EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlpha = pset1(alpha); - const Packet pMask = bmask((const int)(remaining_rows)); + const Packet pMask = bmask(remaining_rows); Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_UNROLL 6 - while(row + MAX_UNROLL*accCols <= rows) { - gemm_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_UNROLL > 7 - case 7: - gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 6 - case 6: - gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 5 - case 5: - gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 4 - case 4: - gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 3 - case 3: - gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 2 - case 2: - gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 1 - case 1: - gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } - - if(remaining_cols > 0) - { - const 
Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); + } + + if (col != cols) + { + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - } } #define accColsC (accCols / 2) @@ -1789,127 +1825,108 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const // PEEL_COMPLEX loop factor. #define PEEL_COMPLEX 3 +#define PEEL_COMPLEX_ROW 3 -template -EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( - const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows, - Index remaining_cols) -{ - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; +#define MICRO_COMPLEX_UNROLL(func) \ + func(0) func(1) func(2) func(3) + +#define MICRO_COMPLEX_ZERO_PEEL(peel) \ + if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ + bsetzero(accReal##peel); \ + bsetzero(accImag##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##peel); \ + EIGEN_UNUSED_VARIABLE(accImag##peel); \ + } + +#define MICRO_COMPLEX_ADD_ROWS(N, used) \ + MICRO_ADD(ptr_real, N) \ + if (!RhsIsReal) { \ + MICRO_ADD(ptr_imag, N) \ + } else if (used) { \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \ + } + +#define MICRO_COMPLEX_BROADCAST(peel) \ + MICRO_BROADCAST1(peel, ptr_real, rhsV, false) \ + if (!RhsIsReal) { \ + MICRO_BROADCAST1(peel, ptr_imag, rhsVi, false) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_BROADCAST_EXTRA \ + Packet rhsV[4], rhsVi[4]; \ + MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false) \ + if(!RhsIsReal) { \ + MICRO_BROADCAST_EXTRA1(ptr_imag, rhsVi, false) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi); \ + } \ + MICRO_COMPLEX_ADD_ROWS(1, true) + +#define MICRO_COMPLEX_SRC2_PTR \ + MICRO_SRC2(ptr_real, strideB*advanceCols, 0) \ + if (!RhsIsReal) { \ + MICRO_RHS(ptr_imag,0) = rhs_base + MICRO_NEW_ROWS*strideB; \ + MICRO_SRC2(ptr_imag, strideB*advanceCols, strideB) \ + } else { \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \ + EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \ + } + +#define MICRO_COMPLEX_ZERO_PEEL_ROW MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_ZERO_PEEL) + +#define MICRO_COMPLEX_WORK_PEEL(peel) \ + if (PEEL_COMPLEX_ROW > peel) { \ + MICRO_COMPLEX_BROADCAST(peel) \ + pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_ADD_COLS(size) 
\ + lhs_ptr_real += (remaining_rows * size); \ + if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * size); \ else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); -} -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; - if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; +#define MICRO_COMPLEX_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ + Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ + MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL) \ + MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW) \ + MICRO_COMPLEX_ADD_ROWS(PEEL_COMPLEX_ROW, false) - bsetzero(accReal); - bsetzero(accImag); - - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); +#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ + if (PEEL_COMPLEX_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accReal##sum.packet[i] += accReal##peel.packet[i]; \ + accImag##sum.packet[i] += accImag##peel.packet[i]; \ + } \ } - for(; k < depth; k++) - { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - } +#define MICRO_COMPLEX_ADD_PEEL_ROW \ + MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ + MICRO_COMPLEX_ADD_PEEL(1, 0) - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); - - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - res(row + 0, col + 0) += pfirst(acc0.packet[0]); - } else { - acc0.packet[0] += res.template loadPacket(row + 0, col + 0); - res.template storePacketBlock(row + 0, col + 0, acc0); - if(remaining_rows > accColsC) { - res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); - } - } -} - -template +template EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* 
&lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows) + const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2, + const Scalar* &rhs_ptr_imag0, const Scalar* &rhs_ptr_imag1, const Scalar* &rhs_ptr_imag2, + PacketBlock &accReal, PacketBlock &accImag) { - Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += accRows; - if(!RhsIsReal) rhs_ptr_imag += accRows; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + MICRO_COMPLEX_BROADCAST_EXTRA + pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + MICRO_COMPLEX_ADD_COLS(1) } -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1918,150 +1935,113 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( Index offsetA, Index strideB, Index row, - Index col, Index rows, - Index cols, - Index remaining_rows, const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask) { - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL; + const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL; const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; + const Scalar* lhs_ptr_imag = NULL; if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; - bsetzero(accReal); - bsetzero(accImag); + MICRO_COMPLEX_SRC2_PTR - Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); + bsetzero(accReal0); + bsetzero(accImag0); + + Index remaining_depth = depth & -quad_traits::rows; Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } - - if ((remaining_depth == depth) && (rows >= accCols)) - { - bload(tRes, res, row, col); - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); - bcouple(taccReal, taccImag, tRes, acc0, acc1); - res.template storePacketBlock(row + 0, col, acc0); - res.template storePacketBlock(row + accColsC, col, acc1); - } else { - for(; k < depth; k++) + if (remaining_depth >= PEEL_COMPLEX_ROW) { + MICRO_COMPLEX_ZERO_PEEL_ROW + do { - Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += accRows; - if(!RhsIsReal) rhs_ptr_imag += accRows; - } + MICRO_COMPLEX_PREFETCHN(accRows) + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + MICRO_COMPLEX_WORK_PEEL_ROW + } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); + MICRO_COMPLEX_ADD_PEEL_ROW + } + for(; k < depth; k++) + { + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0); + } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); + constexpr bool full = (remaining_rows > accColsC); + bload(tRes, res, row, 0); + if ((accRows == 1) || (rows >= accCols)) + { + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + bstore(acc0, res, row + 0); + if (full) { + bstore(acc1, res, row + accColsC); + } + } else { + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) { - for(Index j = 0; j < 4; j++) { - res(row + 0, col + j) += pfirst(acc0.packet[j]); + for(Index j = 0; j < accRows; j++) { + res(row + 0, j) = pfirst(acc0.packet[j]); } } else { - for(Index j = 0; j < 4; j++) { - PacketBlock acc2; - acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; - res.template storePacketBlock(row + 0, col + j, acc2); - if(remaining_rows > accColsC) { - res(row + accColsC, col + j) += pfirst(acc1.packet[j]); + bstore(acc0, res, row + 0); + if (full) { + for(Index j = 0; j < accRows; j++) { + res(row + accColsC, j) = pfirst(acc1.packet[j]); } } } } } -#define MICRO_COMPLEX_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) +#define MICRO_COMPLEX_EXTRA_ROWS(N) \ + 
gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask); + +template +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index rows, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + MICRO_EXTRA(MICRO_COMPLEX_EXTRA_ROWS, remaining_rows, false) +} #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - MICRO_COMPLEX_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) - -#define MICRO_COMPLEX_LOAD_ONE(iter) \ - if (unroll_factor > iter) { \ - lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ - if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ - } \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsV##iter); \ - EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ - } + MICRO_COMPLEX_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ if (unroll_factor > iter) { \ - pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ - } - -#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ - if (unroll_factor > iter) { \ - pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ } #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ - if(!RhsIsReal) { \ - pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ - MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } - -#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ - if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ - if(!RhsIsReal) { \ - rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + MICRO_COMPLEX_BROADCAST(peel) \ MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ @@ -2069,42 +2049,27 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( } #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ - Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ - func(func1,func2,0); func(func1,func2,1); \ - func(func1,func2,2); func(func1,func2,3); \ - func(func1,func2,4); func(func1,func2,5); \ - 
func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ + func(func1,func2,0) func(func1,func2,1) \ + func(func1,func2,2) func(func1,func2,3) #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M], rhsVi0[M];\ - func(func1,func2,0); + func(func1,func2,0) -#define MICRO_COMPLEX_ONE_PEEL4 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (accRows * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX); +#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size) \ + MICRO_COMPLEX_TYPE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE) \ + MICRO_COMPLEX_ADD_ROWS(size, false) -#define MICRO_COMPLEX_ONE4 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += accRows; \ - if(!RhsIsReal) rhs_ptr_imag += accRows; +#define MICRO_COMPLEX_ONE_PEEL4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_PEEL, PEEL_COMPLEX) -#define MICRO_COMPLEX_ONE_PEEL1 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); - -#define MICRO_COMPLEX_ONE1 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += remaining_cols; \ - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; +#define MICRO_COMPLEX_ONE4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_ONE, 1) #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accReal##iter); \ - bsetzero(accImag##iter); \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -2112,55 +2077,26 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE) -#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ - if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } - #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) -#define MICRO_COMPLEX_PREFETCH_ONE(iter) \ - if (unroll_factor > iter) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ - } - #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \ + bload(tRes, res, row + iter*accCols, 0); \ + 
bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + bstore(acc0, res, row + iter*accCols + 0); \ + if (full) { \ + bstore(acc1, res, row + iter*accCols + accColsC); \ + } \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ - } - -#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) - -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -2169,37 +2105,30 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + accRows*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL; + const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL; + const Index imag_delta = accCols*strideA; + const Index imag_delta2 = accCols2*strideA; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + MICRO_COMPLEX_SRC2_PTR MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_DST_PTR Index k = 0; for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } + MICRO_COMPLEX_PREFETCHN(accRows) MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_ONE_PEEL4 } @@ -2209,122 +2138,107 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( } MICRO_COMPLEX_STORE - row += unroll_factor*accCols; + MICRO_COMPLEX_UPDATE } -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( +#define MICRO_COMPLEX_UNROLL_ITER2(N, M) \ + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ + if (M) return; + +template +EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - 
Index& row, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + remaining_cols*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; - - MICRO_COMPLEX_SRC_PTR - MICRO_COMPLEX_DST_PTR - - Index k = 0; - for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - MICRO_COMPLEX_PREFETCH - MICRO_COMPLEX_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_COMPLEX_ONE1 - } - MICRO_COMPLEX_COL_STORE - - row += unroll_factor*accCols; -} - -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index& row, Index rows, - Index col, - Index remaining_cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { -#define MAX_COMPLEX_UNROLL 3 + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + MICRO_NEW_ROWS*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_UNROLL 4 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + MICRO_COMPLEX_UNROLL_ITER2(MAX_COMPLEX_UNROLL, 0); } switch( (rows-row)/accCols ) { #if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 4: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4) + break; #endif #if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 3: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3) + break; #endif #if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 2: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2) + break; #endif #if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, 
offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 1: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1) + break; #endif - default: - break; + default: + break; } #undef MAX_COMPLEX_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } -template +#define MICRO_COMPLEX_EXTRA_COLS(N) \ + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true) +} + +template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlphaReal = pset1(alpha.real()); const Packet pAlphaImag = pset1(alpha.imag()); - const Packet pMask = bmask((const int)(remaining_rows)); + const Packet pMask = bmask(remaining_rows); const Scalar* blockA = (Scalar *) blockAc; const Scalar* blockB = (Scalar *) blockBc; @@ -2332,63 +2246,12 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL - - if(remaining_rows > 0) - { - 
gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) + if (col != cols) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } @@ -2396,6 +2259,8 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl #undef advanceCols #undef advanceRows +#include "MatrixVectorProduct.h" + /************************************ * ppc64le template specializations * * **********************************/ @@ -2409,7 +2274,7 @@ template ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2423,7 +2288,7 @@ template ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2438,7 +2303,7 @@ template ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2452,7 +2317,7 @@ template ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } #endif @@ -2467,7 +2332,7 @@ template ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2481,7 +2346,7 @@ template ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2495,7 +2360,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2509,7 +2374,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2524,7 +2389,7 @@ template ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2538,7 +2403,7 @@ template ::operator()(float* blockB, const 
DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_pack pack; + dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } #endif @@ -2553,7 +2418,7 @@ template, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2567,7 +2432,7 @@ template, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2581,7 +2446,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2595,7 +2460,7 @@ template, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockA, lhs, depth, rows, stride, offset); } @@ -2609,7 +2474,7 @@ template, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2623,7 +2488,7 @@ template, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { - dhs_cpack pack; + dhs_cpack pack; pack(blockB, rhs, depth, cols, stride, offset); } @@ -2649,20 +2514,20 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + gemm_function = &Eigen::internal::gemmMMA; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemmMMA; + gemm_function = &Eigen::internal::gemmMMA; } else{ - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; } #else - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2688,20 +2553,20 @@ void gebp_kernel, std::complex, Index, DataMapper, mr void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports 
("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2726,20 +2591,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const float*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + 
gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2764,20 +2629,20 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const float*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2801,20 +2666,20 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only - gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + gemm_function = &Eigen::internal::gemmMMA; + #elif 
defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemmMMA; + gemm_function = &Eigen::internal::gemmMMA; } else{ - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; } #else - gemm_function = &Eigen::internal::gemm; + gemm_function = &Eigen::internal::gemm; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2839,20 +2704,20 @@ void gebp_kernel, std::complex, Index, DataMapper, const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2877,20 +2742,20 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const double*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, 
accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2915,21 +2780,46 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const double*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif 
defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } + +#if defined(__MMA__) +template +struct gebp_kernel +{ + typedef typename quad_traits::vectortype Packet; + typedef typename quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, + Index rows, Index depth, Index cols, bfloat16 alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, + Index rows, Index depth, Index cols, bfloat16 alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + + Eigen::internal::gemmMMAbfloat16(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } +#endif } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 33d5434..28868ca 100644 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -5,33 +5,41 @@ #define EIGEN_POWER_PREFETCH(p) #endif +#ifdef _ARCH_PWR9 +#define USE_PARTIAL_PACKETS +#endif + +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -EIGEN_STRONG_INLINE void gemm_extra_col( +template +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, Index offsetA, + Index strideB, Index row, - Index col, + Index rows, Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha); + const Packet& pAlpha, + const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_extra_row( +template +EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index row, + Index strideB, + Index offsetB, Index col, Index rows, Index cols, @@ -39,25 +47,11 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha); - template -EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); +EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows); -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( +template +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const 
DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -66,22 +60,22 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col( Index offsetA, Index strideB, Index row, - Index col, + Index rows, Index remaining_rows, - Index remaining_cols, const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlphaImag, + const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +template +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index row, + Index offsetB, Index col, Index rows, Index cols, @@ -90,132 +84,133 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pAlphaImag, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag); - -template -EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); - -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs); + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +template +EIGEN_ALWAYS_INLINE void bstore(PacketBlock& acc, const DataMapper& res, Index row); + +#ifdef USE_PARTIAL_PACKETS +template +EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements); + +template +EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock& acc, const DataMapper& res, Index row, Index elements); +#endif template -EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); -const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, - 16, 17, 18, 19, - 4, 5, 6, 7, - 20, 21, 22, 23}; +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask); -const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, - 24, 25, 26, 27, - 12, 13, 14, 15, - 28, 29, 30, 31}; -//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 -const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask); -//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 -const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2); +#define MICRO_NORMAL(iter) \ + (accCols == accCols2) || (unroll_factor != (iter + 1)) -// Grab two decouples 
real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); +#define MICRO_UNROLL_ITER1(func, N) \ + switch (remaining_rows) { \ + default: \ + func(N, 0) \ + break; \ + case 1: \ + func(N, 1) \ + break; \ + case 2: \ + if (sizeof(Scalar) == sizeof(float)) { \ + func(N, 2) \ + } \ + break; \ + case 3: \ + if (sizeof(Scalar) == sizeof(float)) { \ + func(N, 3) \ + } \ + break; \ + } - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); -} +#ifdef USE_PARTIAL_PACKETS +#define MICRO_UNROLL_ITER(func, N) \ + if (remaining_rows) { \ + func(N, true); \ + } else { \ + func(N, false); \ + } -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); +#define MICRO_NORMAL_PARTIAL(iter) \ + full || (unroll_factor != (iter + 1)) +#else +#define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N) +#endif - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); - acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); - acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); +#define MICRO_COMPLEX_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N) - acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); - acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); - acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); - acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); -} +#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? 
a : b) -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); +#define MICRO_LOAD1(lhs_ptr, iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + } - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); -} +#define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter) -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); +#define MICRO_COMPLEX_LOAD_ONE(iter) \ + if (!LhsIsReal && (unroll_factor > iter)) { \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ + MICRO_LOAD1(lhs_ptr_real, iter) \ - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); +#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \ + if (unroll_factor > iter) { \ + lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } - acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); -} +#define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter) -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); +#define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter) - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); -} +#define MICRO_PREFETCH1(lhs_ptr, iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ + } -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); +#define MICRO_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr, iter) - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); -} +#define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter) + +#ifdef USE_PARTIAL_PACKETS +#define MICRO_UPDATE_MASK +#else +#define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask); +#endif + +#define MICRO_UPDATE \ + if (accCols == accCols2) { \ + MICRO_UPDATE_MASK \ + EIGEN_UNUSED_VARIABLE(offsetA); \ + row += unroll_factor*accCols; \ + } + +#define MICRO_COMPLEX_UPDATE \ + MICRO_UPDATE \ + if(LhsIsReal || (accCols == accCols2)) { \ 
+ EIGEN_UNUSED_VARIABLE(imag_delta2); \ + } -// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. -template -EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) -{ - return ploadu(rhs); -} } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 6540c6f..e4013a7 100644 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,56 +11,87 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -#pragma GCC target("cpu=power10") +// If using dynamic dispatch, set the CPU target. +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC push_options +#pragma GCC target("cpu=power10,htm") +#endif #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair #endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair #endif +#endif + +#include "../../InternalHeaderCheck.h" + +#include "MatrixProductMMAbfloat16.h" namespace Eigen { namespace internal { -template +#define accColsC (accCols / 2) + EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); } -template -EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +#ifdef USE_PARTIAL_PACKETS +template +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc) +#else +template +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc) +#endif { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, j); - - bscale(tRes, result, alpha); - - data.template storePacketBlock(i, j, tRes); +#ifdef USE_PARTIAL_PACKETS + if (full) { + EIGEN_UNUSED_VARIABLE(elements); + bload(tRes, data, i, 0); + bscale(tRes, result, alpha); + bstore(tRes, data, i); + } else { + bload_partial(tRes, data, i, elements); + bscale(tRes, result, alpha); + bstore_partial(tRes, data, i, elements); + } +#else + bload(tRes, data, i, 0); + bscale(tRes, result, alpha, pMask); + bstore(tRes, data, i); +#endif } -template -EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag) { + constexpr bool full = (accCols2 > accColsC); PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); - PacketBlock taccReal, taccImag; - bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); + PacketBlock taccReal, taccImag; + bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask); PacketBlock acc1, acc2; - bcouple(taccReal, taccImag, tRes, acc1, acc2); + bcouple(taccReal, taccImag, tRes, acc1, acc2); - 
data.template storePacketBlock(i + N*accColsC, j, acc1); - data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); + bstore(acc1, data, i); + if (full) { + bstore(acc2, data, i + accColsC); + } } // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments @@ -75,18 +106,6 @@ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L } } -template -EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) -{ - __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); - if(NegativeAccumulate) - { - __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b); - } else { - __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b); - } -} - template EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) { @@ -98,18 +117,13 @@ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con } } -template -EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) -{ - // Just for compilation -} - -template -EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +template +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi, const RhsPacket& rhsV, RhsPacket& rhsVi) { pgerMMA(accReal, rhsV, lhsV); if(LhsIsReal) { pgerMMA(accImag, rhsVi, lhsV); + EIGEN_UNUSED_VARIABLE(lhsVi); } else { if(!RhsIsReal) { pgerMMA(accReal, rhsVi, lhsVi); @@ -122,129 +136,178 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. +template +EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet)* rhs) +{ + return ploadu(rhs); +} + template EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { - rhsV = ploadRhs((const Scalar*)(rhs)); + rhsV = ploadRhs(rhs); } template<> -EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) -{ - rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); - rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); -} - -template<> -EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) { #if EIGEN_COMP_LLVM __builtin_vsx_assemble_pair(&rhsV, - (__vector unsigned char)(ploadRhs((const double *)(((Packet2d *)rhs) + 1))), - (__vector unsigned char)(ploadRhs((const double *)((Packet2d *)rhs )))); + reinterpret_cast<__vector unsigned char>(ploadRhs(rhs + (sizeof(Packet2d) / sizeof(double)))), + reinterpret_cast<__vector unsigned char>(ploadRhs(rhs))); #else - __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs)); + rhsV = *reinterpret_cast<__vector_pair *>(const_cast(rhs)); #endif } -template<> -EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) +EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) { - // Just for compilation + ploadRhsMMA(lhs, lhsV); } +#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11)) +#define VECTOR_PAIR_LOADS_LHS +#endif + // PEEL_MMA loop factor. 
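
A minimal sketch of the three-way kernel selection this patch applies across the gebp_kernel specializations above: EIGEN_ALTIVEC_MMA_ONLY pins the MMA path at build time, EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH probes the CPU at run time via __builtin_cpu_supports, and otherwise the plain VSX path is used. It assumes GCC on PowerPC; kernel_mma and kernel_generic are hypothetical stand-ins for gemmMMA and gemm.

#include <cstdio>

using kernel_fn = void (*)();

static void kernel_generic() { std::puts("generic VSX kernel"); }
static void kernel_mma()     { std::puts("POWER10 MMA kernel"); }

kernel_fn pick_kernel()
{
#if defined(EIGEN_ALTIVEC_MMA_ONLY)
  return &kernel_mma;                 // MMA unconditionally compiled in
#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
  // ISA 3.1 and MMA capability bits are exposed at run time by GCC/glibc.
  if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
    return &kernel_mma;
  return &kernel_generic;
#else
  return &kernel_generic;             // MMA disabled at build time
#endif
}
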
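
The bsetzeroMMA/pgerMMA/storeAccumulator trio above wraps the POWER10 accumulator lifecycle, and ploadRhsMMA shows double-precision operands travelling as 256-bit register pairs (the patch aliases __builtin_vsx_assemble_pair to __builtin_mma_assemble_pair where a compiler lacks it). A minimal standalone sketch, assuming a POWER10 target (e.g. -mcpu=power10); names and operand layout are illustrative, with the pair operand order mirroring the little-endian layout the patch assumes.

#include <altivec.h>

// Accumulate one 4x2 double tile: out += a (4x1) * b (1x2).
void f64_tile_update(const double* a, const double* b, double out[4][2])
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);                       // zero the accumulator

  // Four lhs doubles travel as one register pair, as in ploadRhsMMA above.
  __vector_pair pa;
  __builtin_vsx_assemble_pair(&pa,
      (__vector unsigned char)vec_xl(16, a),
      (__vector unsigned char)vec_xl(0, a));

  __vector double vb = vec_xl(0, b);
  __builtin_mma_xvf64gerpp(&acc, pa, (__vector unsigned char)vb);

  __builtin_mma_disassemble_acc(out, &acc);            // four rows of 2 doubles
}
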
#define PEEL_MMA 7 #define MICRO_MMA_UNROLL(func) \ func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) -#define MICRO_MMA_LOAD_ONE(iter) \ - if (unroll_factor > iter) { \ - lhsV##iter = ploadLhs(lhs_ptr##iter); \ - lhs_ptr##iter += accCols; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsV##iter); \ - } +#define MICRO_MMA_WORK(func, type, peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ + func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) #define MICRO_MMA_WORK_ONE(iter, type, peel) \ if (unroll_factor > iter) { \ - pgerMMA(&accZero##iter, rhsV##peel, lhsV##iter); \ + pgerMMA(&accZero##iter, rhsV[peel], lhsV##iter); \ } -#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \ +#ifdef VECTOR_PAIR_LOADS_LHS +#define MICRO_MMA_WORK_TWO(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgerMMA(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \ + } + +#define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \ + if (unroll_factor > iter) { \ + if (MICRO_NORMAL(iter)) { \ + ploadLhsMMA(reinterpret_cast(lhs_ptr##iter), plhsV##iter); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&lhsV2##iter.packet), &plhsV##iter); \ + lhs_ptr##iter += accCols*2; \ + } else { \ + lhsV2##iter.packet[0] = ploadLhs(lhs_ptr##iter); \ + lhsV2##iter.packet[1] = ploadLhs(lhs_ptr##iter + accCols2); \ + lhs_ptr##iter += accCols2*2; \ + EIGEN_UNUSED_VARIABLE(plhsV##iter) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV2##iter); \ + EIGEN_UNUSED_VARIABLE(plhsV##iter) \ + } + +#define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) +#endif + +#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \ if (PEEL_MMA > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \ - MICRO_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ - func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \ + MICRO_MMA_UNROLL(funcl) \ + MICRO_MMA_WORK(funcw, type, peel) \ } -#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); - -#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ - type rhsV0; \ - MICRO_MMA_TYPE_PEEL(func,func2,type,0); - -#define MICRO_MMA_ONE_PEEL \ - if (sizeof(Scalar) == sizeof(float)) { \ - MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \ +#ifndef VECTOR_PAIR_LOADS_LHS +#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \ + type rhsV[8]; \ + MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \ + MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \ + MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \ + MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7) +#else +#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \ + if (PEEL_MMA > 
peel2) { \ + PacketBlock lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \ + __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \ + if (sizeof(type) == 16) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr + (accRows * peel1)), prhsV##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsV[peel1]), &prhsV##peel1); \ + } else { \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \ + ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \ + } \ + MICRO_MMA_UNROLL(funcl2) \ + MICRO_MMA_WORK(funcw2, type, peel1) \ + MICRO_MMA_WORK(funcw2, type, peel2) \ } else { \ - MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ - } \ - rhs_ptr += (accRows * PEEL_MMA); + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \ + } -#define MICRO_MMA_ONE \ - if (sizeof(Scalar) == sizeof(float)) { \ - MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \ - } else { \ - MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ - } \ - rhs_ptr += accRows; +#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \ + type rhsV[8]; \ + __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \ + MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \ + MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \ + MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \ + MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7) +#endif + +#define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \ + type rhsV[1]; \ + MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) + +#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \ + MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \ + rhs_ptr += (accRows * size); + +#ifndef VECTOR_PAIR_LOADS_LHS +#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA) +#else +#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \ + MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \ + rhs_ptr += (accRows * size); + +#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA) +#endif + +#define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1) #define MICRO_MMA_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzeroMMA(&accZero##iter); \ + bsetzeroMMA(&accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } #define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE) -#define MICRO_MMA_SRC_PTR_ONE(iter) \ - if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ - } +#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_SRC_PTR_ONE) -#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE) - -#define MICRO_MMA_PREFETCH_ONE(iter) \ - if (unroll_factor > iter) { \ - EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ - } - -#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE) +#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE) +#ifdef USE_PARTIAL_PACKETS #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \ } +#else +#define MICRO_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + storeAccumulator(row + 
iter*accCols, res, pAlpha, pMask, &accZero##iter); \ + } +#endif #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) -template -EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( +#ifdef USE_PARTIAL_PACKETS +template +#else +template +#endif +EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -252,8 +315,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( Index strideA, Index offsetA, Index& row, - Index col, - const Packet& pAlpha) + const Packet& pAlpha, +#ifdef USE_PARTIAL_PACKETS + Index accCols2 +#else + const Packet& pMask +#endif + ) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; @@ -262,8 +330,8 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( MICRO_MMA_SRC_PTR MICRO_MMA_DST_PTR - Index k = 0; - for(; k + PEEL_MMA <= depth; k+= PEEL_MMA) + Index k = 0, depth2 = depth - PEEL_MMA; + for(; k <= depth2; k += PEEL_MMA) { EIGEN_POWER_PREFETCH(rhs_ptr); MICRO_MMA_PREFETCH @@ -275,181 +343,245 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( } MICRO_MMA_STORE - row += unroll_factor*accCols; + MICRO_UPDATE } -template +#ifdef USE_PARTIAL_PACKETS +#define MICRO_MMA_UNROLL_ITER2(N, M) \ + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \ + if (M) return; +#else +#define MICRO_MMA_UNROLL_ITER2(N, M) \ + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \ + if (M) return; +#endif + +template +EIGEN_ALWAYS_INLINE void gemmMMA_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_MMA_UNROLL 7 + while(row + MAX_MMA_UNROLL*accCols <= rows) { + MICRO_MMA_UNROLL_ITER2(MAX_MMA_UNROLL, 0); + } + switch( (rows-row)/accCols ) { +#if MAX_MMA_UNROLL > 7 + case 7: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7) + break; +#endif +#if MAX_MMA_UNROLL > 6 + case 6: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6) + break; +#endif +#if MAX_MMA_UNROLL > 5 + case 5: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5) + break; +#endif +#if MAX_MMA_UNROLL > 4 + case 4: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4) + break; +#endif +#if MAX_MMA_UNROLL > 3 + case 3: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3) + break; +#endif +#if MAX_MMA_UNROLL > 2 + case 2: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2) + break; +#endif +#if MAX_MMA_UNROLL > 1 + case 1: + MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 1) + break; +#endif + default: + break; + } +#undef MAX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask); + } +} + +template void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) 
strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlpha = pset1(alpha); - const Packet pMask = bmask((const int)(remaining_rows)); + const Packet pMask = bmask(remaining_rows); + + typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2; Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - - Index row = 0; -#define MAX_MMA_UNROLL 7 - while(row + MAX_MMA_UNROLL*accCols <= rows) { - gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_MMA_UNROLL > 7 - case 7: - gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 6 - case 6: - gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 5 - case 5: - gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 4 - case 4: - gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 3 - case 3: - gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 2 - case 2: - gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 1 - case 1: - gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); } - if(remaining_cols > 0) + if (col != cols) { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; - } + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } } -#define accColsC (accCols / 2) #define advanceRows ((LhsIsReal) ? 1 : 2) #define advanceCols ((RhsIsReal) ? 1 : 2) // PEEL_COMPLEX_MMA loop factor. 
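
The gemmMMA_cols routine above walks full blocks with the maximally unrolled template instantiation, then a switch on the leftover block count jumps to a smaller instantiation; the constant "unroll_factor > iter" guards inside the MICRO_* macros let the compiler discard unused steps. A simplified, non-Eigen sketch of that unroll-and-switch dispatch:

#include <cstddef>

// Each step is guarded by a compile-time test, so unused ones compile away.
template<int unroll>
void rows_kernel(float* dst, const float* src)
{
  if (unroll > 0) dst[0] = src[0] * 2.0f;
  if (unroll > 1) dst[1] = src[1] * 2.0f;
  if (unroll > 2) dst[2] = src[2] * 2.0f;
  if (unroll > 3) dst[3] = src[3] * 2.0f;
}

void scale_rows(float* dst, const float* src, std::size_t rows)
{
  std::size_t r = 0;
  for (; r + 4 <= rows; r += 4) rows_kernel<4>(dst + r, src + r);
  switch (rows - r) {              // remainder picks a smaller instantiation
    case 3: rows_kernel<3>(dst + r, src + r); break;
    case 2: rows_kernel<2>(dst + r, src + r); break;
    case 1: rows_kernel<1>(dst + r, src + r); break;
    default: break;
  }
}
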
-#define PEEL_COMPLEX_MMA 7 +#define PEEL_COMPLEX_MMA 3 #define MICRO_COMPLEX_MMA_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) -#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ - if (unroll_factor > iter) { \ - lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ - if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ - } \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsV##iter); \ - EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ - } +#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \ if (unroll_factor > iter) { \ - pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \ } -#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ - if (PEEL_COMPLEX_MMA > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ - if(!RhsIsReal) { \ - ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ +#ifdef VECTOR_PAIR_LOADS_LHS +#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \ + } + +#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \ + if (!LhsIsReal && (unroll_factor > iter)) { \ + if (MICRO_NORMAL(iter)) { \ + ploadLhsMMA(reinterpret_cast(lhs_ptr_real##iter + imag_delta), plhsVi##iter); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&lhsVi2##iter.packet), &plhsVi##iter); \ } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + lhsVi2##iter.packet[0] = ploadLhs(lhs_ptr_real##iter + imag_delta2); \ + lhsVi2##iter.packet[1] = ploadLhs(lhs_ptr_real##iter + imag_delta2 + accCols2); \ + EIGEN_UNUSED_VARIABLE(plhsVi##iter) \ } \ - MICRO_COMPLEX_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + EIGEN_UNUSED_VARIABLE(lhsVi2##iter); \ + EIGEN_UNUSED_VARIABLE(plhsVi##iter) \ + } \ + MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter) + +#define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) +#endif + +#define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \ + if (PEEL_COMPLEX_MMA > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \ + } \ + MICRO_COMPLEX_MMA_UNROLL(funcl) \ + MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \ } -#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ - 
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); - -#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ - type rhsV0, rhsVi0; \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); - -#define MICRO_COMPLEX_MMA_ONE_PEEL \ - if (sizeof(Scalar) == sizeof(float)) { \ - MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ +#ifndef VECTOR_PAIR_LOADS_LHS +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \ + type rhsV[4], rhsVi[4]; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \ + MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3) +#else +#define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \ + if (PEEL_COMPLEX_MMA > peel2) { \ + PacketBlock lhsV20, lhsV21, lhsV22, lhsV23; \ + PacketBlock lhsVi20, lhsVi21, lhsVi22, lhsVi23; \ + __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \ + __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \ + if (sizeof(type) == 16) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsV[peel1]), &prhsV##peel1); \ + if(!RhsIsReal) { \ + ploadRhsMMA(reinterpret_cast(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \ + __builtin_vsx_disassemble_pair(reinterpret_cast(&rhsVi[peel1]), &prhsVi##peel1); \ + } else { \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \ + } \ + } \ + MICRO_COMPLEX_MMA_UNROLL(funcl2) \ + MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \ + MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \ } else { \ - MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ - } \ - rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \ - if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA); + EIGEN_UNUSED_VARIABLE(prhsV##peel1); \ + EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \ + } -#define MICRO_COMPLEX_MMA_ONE \ - if (sizeof(Scalar) == sizeof(float)) { \ - MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ - } else { \ - MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ - } \ - rhs_ptr_real += accRows; \ - if(!RhsIsReal) rhs_ptr_imag += accRows; +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \ + type rhsV[4], rhsVi[4]; \ + __vector_pair prhsV0, prhsV2; \ + __vector_pair prhsVi0, prhsVi2; \ + MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \ + MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) +#endif + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \ + type rhsV[1], rhsVi[1]; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \ + MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, 
RhsPacket) \ + rhs_ptr_real += (accRows * size); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * size); + +#ifndef VECTOR_PAIR_LOADS_LHS +#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA) +#else +#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \ + MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \ + rhs_ptr_real += (accRows * size); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * size); + +#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA) +#endif + +#define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1) #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzeroMMA(&accReal##iter); \ - bsetzeroMMA(&accImag##iter); \ + bsetzeroMMA(&accReal##iter); \ + bsetzeroMMA(&accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -457,40 +589,19 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE) -#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ - if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } +#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) -#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) - -#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ - if (unroll_factor > iter) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ - } - -#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) +#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -499,27 +610,28 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; + const Index imag_delta2 = accCols2*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * 
lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_DST_PTR - Index k = 0; - for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA) + Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA; + for(; k <= depth2; k += PEEL_COMPLEX_MMA) { EIGEN_POWER_PREFETCH(rhs_ptr_real); if(!RhsIsReal) { @@ -534,85 +646,98 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( } MICRO_COMPLEX_MMA_STORE - row += unroll_factor*accCols; + MICRO_COMPLEX_UPDATE } -template +#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \ + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \ + if (M) return; + +template +EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + MICRO_COMPLEX_MMA_UNROLL_ITER2(MAX_COMPLEX_MMA_UNROLL, 0); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4) + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3) + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2) + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1) + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } +} + +template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; const Packet pAlphaReal = pset1(alpha.real()); const Packet pAlphaImag = pset1(alpha.imag()); - const Packet pMask = bmask((const int)(remaining_rows)); + const Packet pMask = bmask(remaining_rows); const Scalar* blockA = (Scalar *) blockAc; const Scalar* blockB = (Scalar *) blockBc; + typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2; + Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = 
blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_MMA_UNROLL 4 - while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { - gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_MMA_UNROLL > 4 - case 4: - gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 3 - case 3: - gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 2 - case 2: - gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 1 - case 1: - gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) + if (col != cols) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } } @@ -620,10 +745,13 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS #undef advanceRows #undef advanceCols -#pragma GCC reset_options } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC pop_options +#endif + #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h new file mode 100644 index 0000000..b3e063d --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h @@ -0,0 +1,385 @@ + #ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H + #define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H + +namespace Eigen { + +namespace internal { + 
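// ---- Illustrative sketch (editorial, not part of the patch) ----
// The kernels in this file are built around the POWER10 MMA bfloat16
// outer-product instructions. A minimal sketch of the accumulate/extract
// pattern they rely on, assuming GCC with -mcpu=power10; the
// __builtin_mma_* calls are real intrinsics, while the sketch_* names are
// hypothetical helpers used only for illustration:
typedef __vector unsigned char sketch_vec_t;

// acc += a (4x2 bfloat16) * b (2x4 bfloat16), accumulated in 4x4 float32.
static inline void sketch_bf16_ger_step(__vector_quad* acc, sketch_vec_t a, sketch_vec_t b)
{
  __builtin_mma_xvbf16ger2pp(acc, a, b);
}

// Zero an accumulator, apply one update, then copy its four float32 rows
// out. The pmxvbf16ger2pp variant used by pgerMMAbfloat16 below performs
// the same update under x/y mask prefixes so edge blocks can be partial.
static inline void sketch_bf16_block(float out[4][4], sketch_vec_t a, sketch_vec_t b)
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);
  sketch_bf16_ger_step(&acc, a, b);
  __builtin_mma_disassemble_acc(out, &acc);
}
// ----------------------------------------------------------------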
+EIGEN_STRONG_INLINE void pgerMMAbfloat16(__vector_quad* acc, const Packet8bf& a, const Packet8bf& b, int maskX, int maskY) +{ + switch(maskX){ + case 15: + switch(maskY){ + case 0b1111: + __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val)); + break; + case 0b0011: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1111, 0b11, 0b11); + break; + case 0b0001: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1111, 0b1, 0b11); + break; + case 0b0111: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1111, 0b111, 0b11); + break; + } + break; + case 3: + switch(maskY){ + case 0b1111: + __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val)); + break; + case 0b0011: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b11, 0b11, 0b11); + break; + case 0b0001: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b11, 0b1, 0b11); + break; + case 0b0111: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b11, 0b111, 0b11); + break; + } + break; + case 1: + switch(maskY){ + case 0b1111: + __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val)); + break; + case 0b0011: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1, 0b11, 0b11); + break; + case 0b0001: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1, 0b1, 0b11); + break; + case 0b0111: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b1, 0b111, 0b11); + break; + } + break; + case 0b0111: + switch(maskY){ + case 0b1111: + __builtin_mma_xvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val)); + break; + case 0b0011: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b111, 0b11, 0b11); + break; + case 0b0001: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b111, 0b1, 0b11); + break; + case 0b0111: + __builtin_mma_pmxvbf16ger2pp(acc, reinterpret_cast(a.m_val), reinterpret_cast(b.m_val), 0b111, 0b111, 0b11); + break; + } + break; + } +} + +EIGEN_STRONG_INLINE void scaleAndStore(float* result, float* acc, Packet4f pAlpha) +{ + Packet4f result_block = ploadu(result); + Packet4f packet_pmadd = pmadd(pload(acc), pAlpha, result_block); + pstoreu(result, packet_pmadd); +} + +template +EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16(const bfloat16* indexA) +{ + Packet8bf lhs1 = ploadu(indexA); + Packet8bf lhs2; + const int packet_size = 8; //We fit 8 bfloat16 on a 128 register + if(zero){ + lhs2 = pset1(Eigen::bfloat16(0)); + } + else lhs2 = ploadu(indexA + num_packets*packet_size); + return vec_mergeh(lhs1.m_val, lhs2.m_val); +} + +template +EIGEN_STRONG_INLINE Packet8bf loadLhsBfloat16ExtraRows(const bfloat16* indexA, Index strideA, Index row, int extra_rows) +{ + EIGEN_ALIGN16 bfloat16 lhs_array[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)}; + int count = 0; + const bfloat16* idxA = indexA + row*strideA; + for(int row_count = 0; row_count < extra_rows; row_count++){ + lhs_array[count++] = *idxA; + if(!zero) lhs_array[count] = *(idxA+1); + count++; + idxA += strideA; + } + return 
pload(lhs_array); +} + +template +EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16(const bfloat16* baseB, Index strideB, int i, int k) +{ + const bfloat16* indexB = baseB + strideB*4*i + (k*4); + Packet8bf rhs1 = ploadu(indexB); + if(zero){ + Packet8bf rhs2 = pset1(Eigen::bfloat16(0)); + return vec_mergeh(rhs1.m_val, rhs2.m_val); + } + //r = vec_perm (a, b, c) + //Let v be the concatenation of a and b. + //Each byte of r selected by using the least-significant 5 bits of the corresponding byte of c as an index into v + //We need this elements from rhs: 0, 4, 1, 5, 2, 6, 3, 7 + Packet16uc c = {0x0u, 0x1u, 0x8u, 0x9u, 0x2u, 0x3u, 0xAu, 0xB, 0x4, 0x5, 0xCu, 0xDu, 0x6u, 0x7u, 0xEu, 0xFu}; + return vec_perm(rhs1.m_val, rhs1.m_val, c); +} + +template +EIGEN_STRONG_INLINE Packet8bf loadRhsBfloat16ExtraCols(const bfloat16* blockB, Index strideB, Index offsetB, Index col, int i, int k, int extra_cols) +{ + EIGEN_ALIGN16 bfloat16 rhs_vector[8] = {Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0), Eigen::bfloat16(0)}; + const bfloat16* indexB = blockB + ((col+4*i)*strideB)+k+offsetB; + for(int c = 0; c < extra_cols; c++){ + rhs_vector[2*c] = *indexB; + if(!zero) rhs_vector[2*c+1] = *(indexB+1); + indexB += strideB; + } + return pload(rhs_vector); +} + +template +EIGEN_STRONG_INLINE void KLoop +( + const bfloat16* indexA, + const bfloat16* indexB, + __vector_quad (&quad_acc)[num_acc], + Index strideA, + Index strideB, + Index offsetB, + Index k, + Index row, + Index col, + int extra_rows, + int extra_cols, + int mask_rows = 0xF, + int mask_cols = 0xF +) +{ + Packet8bf lhs; + Packet8bf rhs[num_acc]; + if(lhs_extra_rows) lhs = loadLhsBfloat16ExtraRows(indexA+k, strideA, row, extra_rows); + else lhs = loadLhsBfloat16(indexA + k*num_packets*8); //a packet of bfloat16 has 8 elements + for(int i = 0; i < num_acc; i++){ + if(!rhs_extra_cols) + rhs[i] = loadRhsBfloat16(indexB, strideB, i, k); + else{ + rhs[i] = loadRhsBfloat16ExtraCols(indexB, strideB, offsetB, col, i, k, extra_cols); + } + pgerMMAbfloat16(&(quad_acc[i]), rhs[i], lhs, mask_cols, mask_rows); + } +} + +template +void colLoopBody(Index* p_col, Index row, Index depth, Index cols, Index rows, int offset_row, int block_index, Packet4f pAlpha, const bfloat16* indexA, Index strideA, const bfloat16* blockB, Index strideB, Index offsetB, float* result, int extra_cols = 0, int extra_rows = 0, int mask_cols = 0xF, int mask_rows = 0xF) +{ + int col = *p_col; + int count; + int max, step, bound; + const bfloat16* indexB; + + if(num_acc == 1) bound = 0; + else bound = 1; + + if(rhsExtraCols){ + count = 0; + max = 1; + step = 1; + indexB = blockB; + } + else{ + count = col; + step = num_acc * 4; //each accumulator has 4 elements + max = cols/step; + indexB = blockB + 4*offsetB + strideB*col; + } + + while(count/step + bound < max){ + Index k = 0; + EIGEN_ALIGN32 float acc[num_acc][4][4]; + __vector_quad quad_acc[num_acc]; + + for(int i = 0; i < num_acc; i++) + __builtin_mma_xxsetaccz(&(quad_acc[i])); + + if(depth%2 != 0){ + KLoop(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols); + k = 1; + } + for(; k/2 < depth/2; k += 2){ + KLoop(indexA, indexB, quad_acc, strideA, strideB, offsetB, k, row, col, extra_rows, extra_cols, mask_rows, mask_cols); + } + for(int i = 0; i < num_acc; i++){ + __builtin_mma_disassemble_acc((void*)acc[i], &(quad_acc[i])); + if(lhsExtraRows){ + for(int x = 0; x < extra_cols; x++){ + for(int y 
= 0; y < extra_rows; y++){ + result[((col+i*4)+x)*rows + row + y] += acc[i][x][y]*(pAlpha[0]); + } + } + } + else{ + if(rhsExtraCols){ + for(int x = 0; x < cols-col; x++){ + scaleAndStore(result + ((col+i*4)+x)*rows + row + offset_row,acc[i][x], pAlpha); + } + } + else{ + for(int x = 0; x < 4; x++){ + scaleAndStore(result + ((col+i*4)+x)*rows + (block_index*16) + offset_row,acc[i][x], pAlpha); + } + } + } + } + count += step; + if(!rhsExtraCols) { + indexB += strideB*step; + col += step; + } + } + *p_col = col; +} + +template +void gemmMMAbfloat16(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + + if(rows == 0 || cols == 0 || depth == 0) return; + const Packet4f pAlpha = pset1(Eigen::bfloat16_impl::bfloat16_to_float(alpha)); + ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0); + + for(int j = 0; j < cols; j++){ + for(int i = 0; i < rows; i++){ + result[j*rows + i] = res(i,j); + } + } + + Index row = 0; + Index col = 0; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + //Packing is done in blocks. + //There's 3 possible sizes of blocks + //Blocks of 8 columns with 16 elements (8x16) as col major + //Blocks of 8 columns with 8 elements (8x8) as col major. This happens when there's 16 > rows > 8 + //Blocks of 8 columns with <8 elements as row major. This happens when there's less than 8 remaining rows + + //Loop for LHS standard block (8x16) + int standard_block_size = 16; + const int standard_blocks_quantity = rows/standard_block_size; //Number of standard blocks + int bigSuffix = (2*8) * (strideA-offsetA-depth); + const bfloat16* indexA = blockA; + int block_index; + for(block_index = 0; block_index < standard_blocks_quantity; block_index++){ + indexA += 2*8*offsetA; + for(int offset_row = 0; offset_row < standard_block_size; offset_row += 4){ //This block size has 16 rows maximum + col = 0; + colLoopBody<5, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<4, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<3, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<2, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<1, 16, 2>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + if(cols > col){ + int extra_cols= cols-col; + int shift = (4-extra_cols>= 0) ? 
4-extra_cols: 0; + int mask_cols= 0xF >> shift; + //Remember: It doesnt make sense use multiple acc to extra_cols as we are unrolling col loop + colLoopBody<1, 16, 2, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF); + } + } + row += 16; + indexA += bigSuffix + 2*8*depth; + } + //LHS (8x8) block + if(rows - standard_blocks_quantity*16 >= 8){ + indexA += 1*8*offsetA + 2*8*offsetA; + for(int offset_row = 0; offset_row < 8; offset_row += 4){ + col = 0; + colLoopBody<5, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<4, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<3, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<2, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + colLoopBody<1, 8, 1>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result); + } + if(cols > col){ + int extra_cols= cols-col; + int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0; + int mask_cols= 0xF >> shift; + + for(int offset_row = 0; offset_row < 8; offset_row += 4){ + colLoopBody<1, 8, 1, true>(&col, row, depth, cols, rows, offset_row, block_index, pAlpha, indexA+offset_row, strideA, blockB, strideB, offsetB, result, extra_cols, 4, mask_cols, 0xF); + } + } //end extra cols + row += 8; + } + //extra rows + while(row < rows){ + int extra_rows = rows-row; + int shift = (4-extra_rows >= 0) ? 4-extra_rows : 0; + int mask_rows = 0xF >> shift; + int extra_rows_or_four = (extra_rows <= 4) ? extra_rows : 4; + + //This index is the beginning of remaining block. + //This last block for LHS is organized as RowMajor + col = 0; + colLoopBody<5, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows); + colLoopBody<4, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows); + colLoopBody<3, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows); + colLoopBody<2, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows); + colLoopBody<1, 8, 1, false, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, 4, extra_rows_or_four, 0xF, mask_rows); + if(cols > col){ + int extra_cols= cols-col; + int shift = (4-extra_cols>= 0) ? 4-extra_cols: 0; + int mask_cols= 0xF >> shift; + int extra_cols_or_four = (extra_cols <= 4) ? 
extra_cols : 4; + + colLoopBody<1, 8, 1, true, true>(&col, row, depth, cols, rows, 0, block_index, pAlpha, blockA, strideA, blockB, strideB, offsetB, result, extra_cols_or_four, extra_rows_or_four, mask_cols, mask_rows); + } + row += extra_rows_or_four; + } + + //Convert back to bfloat16 + for(col = 0; col/4 < cols/4; col += 4){ + int row; + for(row = 0; row/8 < rows/8; row += 8){ + //get and save block + PacketBlock block; + for(int j = 0; j < 4; j++){ + Packet4f temp_even, temp_odd; + EIGEN_ALIGN32 float even[4], odd[4]; + for(int i = 0; i < 4; i++){ + even[i] = result[(col + j)*rows + row + i*2]; + odd[i] = result[(col + j)*rows + row + i*2+1]; + } + temp_even = pload(even); + temp_odd = pload(odd); + block.packet[j] = F32ToBf16(temp_even, temp_odd); + } + + res.template storePacketBlock(row, col, block); + } + //extra rows + while(row < rows){ + for(int col_off = 0; col_off < 4; col_off++){ + res(row, col+col_off) = Eigen::bfloat16(result[(col+col_off)*rows+row]); + } + row++; + } + + } + //extra cols + while(col < cols){ + for(int r = 0; r < rows; r++){ + res(r, col) = Eigen::bfloat16(result[col*rows + r]); + } + col++; + } +} + + +} +} +#endif //EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h new file mode 100644 index 0000000..9d00b93 --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h @@ -0,0 +1,2400 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
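// ---- Illustrative sketch (editorial, not part of the patch) ----
// The macro machinery in this file unrolls a single contract: for a
// column-major lhs, accumulate res[i] += alpha * sum_j lhs(i,j) * rhs[j],
// advancing i by N*ResPacketSize rows per unrolled step with a scalar
// tail for the leftover rows (visible at the end of gemv_col). A plain
// C++ rendering of that contract, with hypothetical names and dense
// column-major storage with leading dimension lhsStride assumed:
template <typename Scalar>
void sketch_gemv_col_reference(long rows, long cols,
                               const Scalar* lhs, long lhsStride,
                               const Scalar* rhs, Scalar* res, Scalar alpha)
{
  for (long i = 0; i < rows; ++i) {
    Scalar d0(0);
    for (long j = 0; j < cols; ++j)
      d0 += lhs[i + j * lhsStride] * rhs[j]; // cj.pmul handles conjugation in the real kernel
    res[i] += alpha * d0;                    // mirrors gemv_col's scalar tail loop
  }
}
// ----------------------------------------------------------------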
+ +#ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + +#include "../../InternalHeaderCheck.h" + +#if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA +#if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3) +#define USE_GEMV_MMA +#endif + +#if !EIGEN_COMP_LLVM && (__GNUC__ == 10 && __GNUC_MINOR__ <= 3) +// Only allow one vector_pair in buggy gcc - gcc 10.3 has a bug +#define GCC_ONE_VECTORPAIR_BUG +#endif +#endif + +//#define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved) + +//#define EIGEN_POWER_USE_GEMV_PREFETCH +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p) +#else +#define EIGEN_POWER_GEMV_PREFETCH(p) +#endif + +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if EIGEN_COMP_LLVM +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#if (__GNUC__ <= 10) +#if (__GNUC_MINOR__ > 3) +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#endif + +#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16)) +#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float))) +#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16) +#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex))) + +/** \internal multiply and add and store results */ +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) +{ + pstoreu(res, pmadd(data, palpha, ploadu(res))); +} + +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) +{ + *res += (alpha * data); +} + +#define GEMV_UNROLL(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) \ + func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_HALF(func, N) \ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N) + +#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2) + +#define GEMV_LOADPACKET_COL(iter) \ + lhs.template load(i + ((iter) * LhsPacketSize), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_VAR(iter, N, which) \ + if (GEMV_GETN(N) <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \ + if (N <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which) + +#define GEMV_UNUSED(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&e##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ 
+ GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1)); +#else +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ + const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \ + b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src##iter1)); +#endif + +#define GEMV_LOAD1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + g##iter = GEMV_LOADPACKET_COL(iter); \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } + +#define GEMV_WORK1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&e##iter, a0, g##iter); \ + } else { \ + pger_vecMMA_acc(&e##iter, b##iter, a0); \ + } \ + } + +#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2) \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \ + GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(g##iter2); \ + EIGEN_UNUSED_VARIABLE(g##iter3); + +#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + LhsPacket h[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(h), &b##iter2); \ + pger_vecMMA_acc(&e##iter2, a0, h[0]); \ + pger_vecMMA_acc(&e##iter3, a0, h[1]); \ + } else { \ + pger_vecMMA_acc(&e##iter2, b##iter2, a0); \ + pger_vecMMA_acc(&e##iter3, b##iter3, a0); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \ + } + +#define GEMV_WORK_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) + +#define GEMV_WORK_COL_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \ + if (!GEMV_IS_FLOAT) { \ + result##iter.packet[0][1] = result##iter.packet[1][0]; \ + result##iter.packet[2][1] = result##iter.packet[3][0]; \ + } \ + } + +#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \ + b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)); + +#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + ResPacket f##iter2[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(f##iter2), &b##iter2); \ + f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \ + f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 
2 : 0], palpha, f##iter2[1]); \ + GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]); +#else +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + if (GEMV_IS_FLOAT) { \ + __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } else { \ + __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } +#endif + +#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \ + } else { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \ + GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \ + } \ + } + +#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \ + *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1; + +#define GEMV_STORE_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + storeMaddData(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \ + GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \ + } \ + } + +#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \ + } else { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \ + } \ + } + +#define GEMV_PROCESS_COL_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_MMA, N) \ + Index j = j2; \ + __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \ + do { \ + LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_MMA(N) \ + GEMV_WORK_COL_MMA(N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \ + if (GEMV_GETN(N) <= 1) { \ + GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define GEMV_PREFETCH(iter, N) \ + if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \ + lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \ + } +#else +#define GEMV_PREFETCH(iter, N) +#endif + +#define GEMV_WORK_COL(iter, N) \ + if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \ + } + +#define GEMV_STORE_COL(iter, N) \ + if (N > iter) { \ + pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu(res + i + (iter * ResPacketSize)))); \ + } + +/** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_ONE(N) \ + GEMV_UNROLL(GEMV_INIT, N) \ + Index j = j2; \ + do { \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL, N) \ + i += (ResPacketSize * N); + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE_MMA(N) +#else +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE(N) +#endif + +/** \internal perform a matrix multiply and accumulate of 
packet a and packet b */ +#ifdef USE_GEMV_MMA +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b); + } +} +#endif + +template +EIGEN_STRONG_INLINE void gemv_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + conj_helper pcj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + ResPacket palpha = pset1(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#ifdef USE_GEMV_MMA + __vector_quad e0, e1, e2, e3, e4, e5, e6, e7; + PacketBlock result0, result1, result2, result3, result4, result5, result6, result7; + GEMV_UNUSED(8, e) + GEMV_UNUSED(8, result) + GEMV_UNUSED_EXTRA(1, c) +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + while (i < n8) + { + GEMV_PROCESS_COL(8) + } + if (i < n4) + { + GEMV_PROCESS_COL(4) + } + if (i < n2) + { + GEMV_PROCESS_COL(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb }; +const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 }; + +#ifdef _BIG_ENDIAN +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +#else +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +#endif + +#ifdef _BIG_ENDIAN +#define COMPLEX_DELTA 0 +#else +#define COMPLEX_DELTA 2 +#endif + +/** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */ +EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) { + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR))); +} + +/** \internal packet conjugate with real & imaginary operation inverted */ +EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) { +#ifdef __POWER8_VECTOR__ + return Packet2cf(Packet4f(vec_neg(Packet2d(a.v)))); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR2))); +#endif +} + 
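// ---- Illustrative sketch (editorial, not part of the patch) ----
// The CONJ_XOR/NEGATE tables above all exploit the same IEEE-754 fact:
// negating a float flips only its sign bit, so conj(a + bi) is a vector
// XOR against a mask whose 0x80 bytes line up with the sign byte of the
// imaginary lanes. A scalar rendering of the trick:
#include <cstdint>
#include <cstring>

static inline float sketch_flip_sign(float x)
{
  std::uint32_t u;
  std::memcpy(&u, &x, sizeof u);
  u ^= UINT32_C(0x80000000);  // the bit the 0x80,0x00,0x00,0x00 mask bytes target
  std::memcpy(&x, &u, sizeof x);
  return x;                   // e.g. sketch_flip_sign(1.5f) == -1.5f
}
// ----------------------------------------------------------------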
+EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR2))); +} + +#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12) +#define PERMXOR_GOOD // Clang had a bug with vec_permxor and endianness prior to version 12 +#endif + +/** \internal flip the real & imaginary results and packet conjugate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +/** \internal packet conjugate and flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +/** \internal packet negate */ +EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) +{ +#ifdef __POWER8_VECTOR__ + return Packet2cf(vec_neg(a.v)); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_NEGATE))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) +{ +#ifdef __POWER8_VECTOR__ + return Packet1cd(vec_neg(a.v)); +#else + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_NEGATE))); +#endif +} + +/** \internal flip the real & imaginary results and negate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +/** \internal flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) +{ + return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) +{ +#ifdef __VSX__ + return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2)); +#else + return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP))); +#endif +} + +/** \internal load half a vector with one complex value */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex* src) +{ + Packet4f t; +#ifdef __VSX__ + // Load float64/two float32 (doubleword alignment) + __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src)); +#else + *reinterpret_cast*>(reinterpret_cast(&t) + COMPLEX_DELTA) = *src; +#endif + return t; +} + +/** \internal load two vectors from the real and imaginary portions of a complex value */ +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) +{ +#ifdef _ARCH_PWR9 + __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast(src) 
+ 0))); + __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet4f t = pload_complex_half(src); + r = vec_splat(t, COMPLEX_DELTA + 0); + i = vec_splat(t, COMPLEX_DELTA + 1); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) +{ +#ifdef __VSX__ + __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast(src) + 0))); + __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet2d t = ploadu(reinterpret_cast(src)); + r = vec_splat(t, 0); + i = vec_splat(t, 1); +#endif +} + +#ifndef __POWER8_VECTOR__ +const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B }; + +const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +#endif + +/** \internal load two vectors from the interleaved real & imaginary values of src */ +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) +{ + Packet4f t = ploadu(reinterpret_cast(src)); +#ifdef __POWER8_VECTOR__ + r = vec_mergee(t, t); + i = vec_mergeo(t, t); +#else + r = vec_perm(t, t, p16uc_MERGEE); + i = vec_perm(t, t, p16uc_MERGEO); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) +{ + return pload_realimag(src, r, i); +} + +/** \internal load and splat a complex value into a vector - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex* src) +{ +#ifdef __VSX__ + Packet4f ret; + __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast(src) + 0))); + return ret; +#else + return Packet4f(ploaddup(reinterpret_cast(src))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a complex value into a vector - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a scalar or a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex* src) +{ + if (GEMV_IS_SCALAR) { + return pload_complex_half(src); + } + else + { + return ploadu(reinterpret_cast(src)); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex* src) +{ + return ploadu(reinterpret_cast(src)); +} + +/** \internal load from a complex vector and convert to a real vector */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) +{ + return src->v; +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) +{ + return src->v; +} + +/** \internal load a full vector from complex location - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex* src) +{ + return Packet4f(ploaddup(reinterpret_cast(src))); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a full vector from complex location - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex* src) +{ + return pload_complex_full(src); +} + +/** \internal load a vector from a real-only scalar location - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) 
+{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) +{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) +{ + return src; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) +{ + return src; +} + +/** \internal load a vector from a real-only vector location */ +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) +{ + Packet4f ret = ploadu(src); + return vec_mergeh(ret, ret); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +/** \internal load a vector from a real-only scalar location - row-wise */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) +{ + if (GEMV_IS_SCALAR) { + return pload_real_full(src); + } + else { + return ploadu(src); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +/** \internal set a scalar from complex location */ +template +EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) +{ + return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag()); +} + +/** \internal set a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex& alpha) +{ + Packet2cf ret; + ret.v[COMPLEX_DELTA + 0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[COMPLEX_DELTA + 1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0]; + ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1]; + return ret; +} + +template +EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex& alpha) +{ + Packet1cd ret; + ret.v[0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + return ret; +} + +/** \internal zero out a vector for real or complex forms */ +template +EIGEN_ALWAYS_INLINE Packet pset_zero() +{ + return pset1(__UNPACK_TYPE__(Packet)(0)); +} + +template<> +EIGEN_ALWAYS_INLINE Packet2cf pset_zero() +{ + return Packet2cf(pset1(float(0))); +} + +template<> +EIGEN_ALWAYS_INLINE Packet1cd pset_zero() +{ + return Packet1cd(pset1(double(0))); +} + +/** \internal initialize a vector from another vector */ +template +EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + EIGEN_UNUSED_VARIABLE(c1); + return pset_zero(); + } + else + { + return c1; // Intentionally left uninitialized + } +} + +template +struct alpha_store +{ + alpha_store(ResScalar& alpha) { + separate.r = pset1_complex(alpha); + separate.i = pset1_complex(alpha); + } + struct ri { + PResPacket r; + PResPacket i; + } separate; +}; + +/** \internal multiply and add for complex math */ +template +EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) +{ + return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4)); +} + +/** \internal store and madd for complex math */ 
+template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + if (GEMV_IS_SCALAR) { + ScalarPacket c4 = ploadu(reinterpret_cast(res)); + ScalarPacket c3 = pmadd_complex(c0.v, c2.v, c4, b0); + pstoreu(reinterpret_cast(res), c3); + } else { + ScalarPacket c4 = pload_complex(res); + PResPacket c3 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + pstoreu(res, c3); + } +} + +template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + PResPacket c3 = pcplxflipconj(c1); +#if !defined(_ARCH_PWR10) + ScalarPacket c4 = pload_complex(res + (iter2 * ResPacketSize)); + ScalarPacket c5 = pload_complex(res + ((iter2 + 1) * ResPacketSize)); + PResPacket c6 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + PResPacket c7 = PResPacket(pmadd_complex(c1.v, c3.v, c5, b0)); + pstoreu(res + (iter2 * ResPacketSize), c6); + pstoreu(res + ((iter2 + 1) * ResPacketSize), c7); +#else + __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)); +#if EIGEN_COMP_LLVM + PResPacket c6[2]; + __builtin_vsx_disassemble_pair(reinterpret_cast(c6), &a); + c6[0] = PResPacket(pmadd_complex(c0.v, c2.v, c6[0].v, b0)); + c6[1] = PResPacket(pmadd_complex(c1.v, c3.v, c6[1].v, b0)); + GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v); +#else + if (GEMV_IS_COMPLEX_FLOAT) { + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } else { + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } +#endif + *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a; +#endif +} + +/** \internal load lhs packet */ +template +EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) +{ + if (sizeof(Scalar) == sizeof(LhsScalar)) { + const LhsScalar& src = lhs(i + 0, j); + return LhsPacket(pload_real_full(const_cast(&src))); + } + return lhs.template load(i + 0, j); +} + +/** \internal madd for complex times complex */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (ConjugateLhs && ConjugateRhs) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return vec_nmsub(a, b, c); + } + else { + return vec_madd(a, b, c); + } +} + +/** \internal madd for complex times real */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (Conjugate) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else { + return vec_madd(a, b, c); + } +} + +template +EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + conj_helper pcj; + RhsPacket b0; + if (StorageOrder == ColMajor) { + b0 = pset1(*b); + } + else { + b0 = ploadu(b); + } + c0 = pcj.pmadd(a0, b0, c0); +} + +/** \internal core multiply operation for vectors - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) +{ + ScalarPacket br, bi; + if (StorageOrder == ColMajor) { + 
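// pload_realimag splats the real part of *b into br and the imaginary
// part into bi; below, a0*br is accumulated into c0 while the flipped,
// conjugated a1*bi is accumulated into c1, keeping the two halves of the
// complex product in separate accumulators until they are recombined
// after the accumulation loop.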
pload_realimag(b, br, bi); + } + else { + pload_realimag_row(b, br, bi); + } + if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0); + LhsPacket a1 = pcplxflipconj(a0); + ScalarPacket cr = pmadd_complex_complex(a0.v, br, c0.v); + ScalarPacket ci = pmadd_complex_complex(a1.v, bi, c1.v); + c1 = ResPacket(ci); + c0 = PResPacket(cr); +} + +/** \internal core multiply operation for vectors - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } + ScalarPacket cri = pmadd_complex_real(a0, b0, c0.v); + c0 = PResPacket(cri); +} + +/** \internal core multiply operation for vectors - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket a1 = pload_complex(&a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + ScalarPacket cri = pmadd_complex_real(a1, b0, c0.v); + c0 = PResPacket(cri); +} + +#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \ +{ \ + gemv_mult_complex_complex(a0, b, c0, c1); \ +} + +GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex, Packet2cf) +GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex, Packet1cd) + +#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \ +{ \ + gemv_mult_real_complex(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX(float, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(double, std::complex, Packet1cd) +GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex, Packet1cd) + +#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \ +{ \ + gemv_mult_complex_real(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, double, Packet1cd, std::complex) + +#ifdef USE_GEMV_MMA +/** \internal convert packet to real form */ +template +EIGEN_ALWAYS_INLINE T convertReal(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) +{ + return a.v; +} + +EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) +{ + return a.v; +} + +/** \internal convert packet to complex form */ +template +EIGEN_ALWAYS_INLINE T convertComplex(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) +{ + return Packet2cf(a); +} + +EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) +{ + return Packet1cd(a); +} + +/** \internal load a vector from a complex location (for MMA version) */ +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) +{ + a = SLhsPacket(pload_complex(&a)); +} + +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) +{ + // Pass thru +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) +{ + 
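// In the MMA GER mnemonics the suffix encodes the signs applied to the
// product and the accumulator: xvf32gerpp computes acc += a*b, while
// xvf32gernp computes acc += -(a*b), which realizes NegativeAccumulate
// without separately negating the inputs.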
if (NegativeAccumulate) + { + __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) +{ + if (NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) +{ + // Just for compilation +} + +/** \internal madd for complex times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, b2, a.v); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, b, a.v); + } + else { + return pger_vecMMA(c, b, a.v); + } +} + +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, a, b); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal madd for complex times real (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + RealPacket a2 = convertReal(a); + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b2, a2); + } else { + return pger_vecMMA(c, a2, b2); + } + } + else { + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b, a2); + } else { + return pger_vecMMA(c, a2, b); + } + } +} + +/** \internal madd for real times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_realimag_combine(b); + } else { + b0 = pload_realimag_combine_row(b); + } + pmadd_complex_complex_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + pload_complex_MMA(a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + pmadd_complex_real_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } 
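// At this point b0 holds one complex rhs value widened to a full real
// vector (real and imaginary lanes interleaved), so the single rank-1
// MMA update below multiplies the real lhs against both lanes at once.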
+ pmadd_complex_real_MMA)) ? StorageOrder : ColMajor>(a0, b0, c0); +} + +#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex) + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex* b, __vector_quad* c0) +{ + if (sizeof(LhsScalar) == 16) { + gemv_mult_complex_complex_MMA(a0, b, c0); + } + else { + gemv_mult_real_complex_MMA(a0, b, c0); + } +} + +#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_real_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex) +GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex) + +#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_real_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float) +GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double) + +/** \internal disassemble MMA accumulator results into packets */ +template +EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (sizeof(LhsPacket) == 16) { + if (sizeof(RhsPacket) == 16) { + ScalarPacket tmp0, tmp2; + tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]); + tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]); + result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]); + result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]); + result0.packet[2] = tmp2; + result0.packet[0] = tmp0; + + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } else { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]); + } else { + result0.packet[0][1] = result0.packet[1][1]; + result0.packet[2][1] = result0.packet[3][1]; + } + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else { + if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + } else if 
(sizeof(LhsPacket) == sizeof(std::complex)) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } else { + result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]); + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock& result0) +{ + if (!GEMV_IS_COMPLEX_FLOAT) { + disassembleResults2(c0, result0); + } else { + disassembleResults4(c0, result0); + } +} +#endif + +#define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1) + +#define GEMV_LOADPACKET_COL_COMPLEX(iter) \ + loadLhsPacket(lhs, i + ((iter) * ResPacketSize), j) + +#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); +#else +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + if (sizeof(LhsPacket) == 16) { \ + const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \ + a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); \ + } else { \ + f##iter1 = lhs.template load(i + ((iter2) * ResPacketSize), j); \ + GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \ + } +#endif + +#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + } else { \ + GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + gemv_mult_complex_MMA(f##iter, b, &e0##iter); \ + } else { \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3) \ + } else { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(f##iter2); \ + EIGEN_UNUSED_VARIABLE(f##iter3); + +#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket g[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(g), &a##iter2); \ + gemv_mult_complex_MMA(g[0], b, &e0##iter2); \ + gemv_mult_complex_MMA(g[1], b, &e0##iter3); \ + } else { \ + gemv_mult_complex_MMA(a##iter2, b, &e0##iter2); \ + gemv_mult_complex_MMA(a##iter3, b, &e0##iter3); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, 
N) \ + } + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \ + disassembleResults(&e0##iter, result0##iter); + +#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \ + c0##iter = PResPacket(result0##iter.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } else { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \ + c0##iter = PResPacket(result0##iter.packet[2]); \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \ + } \ + } + +#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \ + c0##iter2 = PResPacket(result0##iter2.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + c0##iter3 = PResPacket(result0##iter3.packet[0]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } else { \ + c0##iter3 = PResPacket(result0##iter2.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + c0##iter2 = PResPacket(result0##iter3.packet[0]); \ + c0##iter3 = PResPacket(result0##iter3.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } \ + } + +#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_WORK_COL_COMPLEX_MMA(N) \ + } while (++j < jend); \ + if (GEMV_GETN(N) <= 2) { \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT_COMPLEX(iter, N) \ + if (N > iter) { \ + c0##iter = pset_zero(); \ + c1##iter = pset_init(c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + gemv_mult_complex(f##iter, b, c0##iter, c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_STORE_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } + +/** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \ + i += (ResPacketSize * N); + +#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA)) +#define USE_GEMV_COL_COMPLEX_MMA 
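Stripped of packet types and unrolling macros, `GEMV_PROCESS_COL_COMPLEX_ONE(_MMA)` above follows the classic column-major GEMV schedule: a block of result rows stays live in accumulators while a block of columns is swept, and memory is written once per block. A scalar stand-in for that control flow (illustrative only, fixed block of 8 rows):

```cpp
#include <cstddef>

void gemv_col_block(const float* lhs, std::size_t lhsStride, const float* rhs,
                    float* res, float alpha, std::size_t i0,
                    std::size_t j2, std::size_t jend) {
  float c[8] = {0.0f};                           // GEMV_INIT_*: accumulators stay in registers
  for (std::size_t j = j2; j < jend; ++j) {      // the do { ... } while (++j < jend) sweep
    float b = rhs[j];
    for (int k = 0; k < 8; ++k)
      c[k] += lhs[(i0 + k) + j * lhsStride] * b; // GEMV_WORK_*: multiply-accumulate
  }
  for (int k = 0; k < 8; ++k)
    res[i0 + k] += alpha * c[k];                 // GEMV_STORE_*: one store pass per block
}
```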
+#endif + +#ifdef USE_GEMV_COL_COMPLEX_MMA +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) +#else +#if defined(USE_GEMV_MMA) && (__GNUC__ > 10) +#define GEMV_PROCESS_COL_COMPLEX(N) \ + if (sizeof(Scalar) != sizeof(LhsPacket)) { \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + } else { \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + } +#else +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) +#endif +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + + typedef alpha_store AlphaData; + AlphaData alpha_data(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; + PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + __vector_pair a0, a1, a2, a3, a4, a5, a6, a7; + PacketBlock result00, result01, result02, result03, result04, result05, result06, result07; + GEMV_UNUSED(8, e0) + GEMV_UNUSED(8, result0) + GEMV_UNUSED(8, a) + GEMV_UNUSED(8, f) +#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA) + if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT) +#endif +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + { + while (i < n8) + { + GEMV_PROCESS_COL_COMPLEX(8) + } + } + while (i < n4) + { + GEMV_PROCESS_COL_COMPLEX(4) + } + if (i < n2) + { + GEMV_PROCESS_COL_COMPLEX(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_COMPLEX_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +template struct ScalarBlock { + Scalar scalar[N]; +}; + +#ifdef USE_GEMV_MMA +static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f }; + +/** \internal predux (add elements of a vector) from a MMA accumulator - real results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]); + result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]); + result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]); + result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3); + result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template<> +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +/** \internal add complex results together */ +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock& result0, PacketBlock& result1) +{ + ScalarBlock, 2> cc0; + result0.packet[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[0]), reinterpret_cast(result1.packet[0]))); + result0.packet[2] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[2]), reinterpret_cast(result1.packet[2]))); + result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]); + if (GEMV_IS_COMPLEX_COMPLEX) { + result0.packet[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[1]), reinterpret_cast(result1.packet[1]))); + result0.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[3]), reinterpret_cast(result1.packet[3]))); + result0.packet[1] = vec_add(result0.packet[1], 
result0.packet[3]); + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + } else { + if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex))) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[0][2]); + cc0.scalar[1].imag(result0.packet[0][3]); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock&, PacketBlock&) +{ + ScalarBlock, 2> cc0; + EIGEN_UNUSED_VARIABLE(cc0); + return cc0; // Just for compilation +} + +/** \internal predux (add elements of a vector) from a MMA accumulator - complex results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + return addComplexResults(result0, result1); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0) +{ + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0) +{ + ScalarBlock cc0; + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } else if (ConjugateRhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2)); + result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2)); + } else { + result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1); + result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1); + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[2][0]); + cc0.scalar[1].imag(result0.packet[2][1]); + return cc0; +} +#endif + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(ResPacket& a, ResPacket& b) +{ + ScalarBlock cc0; + cc0.scalar[0] = predux(a); + cc0.scalar[1] = predux(b); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPacket& b) +{ + return predux_real(a, b); +} + +#define GEMV_UNROLL_ROW(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_ROW_HALF(func, N) \ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) 
func(3, 6, 7, N) + +#define GEMV_LOADPACKET_ROW(iter) \ + lhs.template load(i + (iter), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3_ROW(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_ROW(N, which) \ + GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&c##iter); \ + } + +#define GEMV_LOADPAIR_ROW(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1)); + +#define GEMV_WORK_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \ + } else { \ + __vector_pair b##iter; \ + GEMV_LOADPAIR_ROW(iter, iter << 1) \ + pger_vecMMA_acc(&c##iter, b##iter, a0); \ + } \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_FLOAT) { \ + cc##iter1 = predux_real(&c##iter2, &c##iter3); \ + } else { \ + cc##iter1 = predux_real(&c##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#else +#define GEMV_INIT_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#define GEMV_WORK_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_real(c##iter2, c##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#endif + +#define GEMV_MULT(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \ + } + +#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +/** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW(N) \ + for (; i < n##N; i += N) { \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \ + Index j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket a0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \ + for (; j < cols; ++j) { \ + RhsScalar a0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \ + } + +template +EIGEN_STRONG_INLINE void gemv_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; + conj_helper pcj; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. 
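In scalar terms the row kernel that follows is a plain dot product per row: each of the N rows keeps its own SIMD partial sum, which `predux` reduces to a scalar only once, after the column loop. A reference version, illustrative only:

```cpp
#include <cstddef>

void gemv_row_ref(std::size_t rows, std::size_t cols, const float* lhs,
                  std::size_t lhsStride, const float* rhs, float* res,
                  std::size_t resIncr, float alpha) {
  for (std::size_t i = 0; i < rows; ++i) {
    float d0 = 0.0f;                             // per-row accumulator (ResPacket or __vector_quad)
    for (std::size_t j = 0; j < cols; ++j)
      d0 += lhs[i * lhsStride + j] * rhs[j];     // GEMV_WORK_ROW: pcj.pmadd / pger_vecMMA_acc
    res[i * resIncr] += alpha * d0;              // predux, then storeMaddData
  }
}
```

The `n8`/`n4`/`n2` bounds below simply peel this loop so that 8, 4, or 2 such accumulators are in flight at once.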
+#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + + Index i = 0; +#ifdef USE_GEMV_MMA + __vector_quad c0, c1, c2, c3, c4, c5, c6, c7; + GEMV_UNUSED_ROW(8, c) +#else + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; + GEMV_PROCESS_ROW(8) + GEMV_PROCESS_ROW(4) + GEMV_PROCESS_ROW(2) +#endif + for (; i < rows; ++i) + { + ResPacket d0 = pset1(ResScalar(0)); + Index j = 0; + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) + { + RhsPacket b0 = rhs2.template load(j); + + d0 = pcj.pmadd(lhs.template load(i + 0, j), b0, d0); + } + ResScalar dd0 = predux(d0); + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double) + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + a0 = padd(a0, a1); + b0 = padd(b0, b1); + } + return predux_complex(a0, b0); +} + +#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \ + loadLhsPacket(lhs, i + (iter), j) + +#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter)) + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + const RhsScalar& b1 = rhs2(j); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL_ROW(which, N) \ + } + +#define GEMV_PROCESS_END_ROW_COMPLEX(N) \ + for (; j < cols; ++j) { \ + RhsScalar b0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket a##iter = 
GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } else { \ + __vector_pair a##iter; \ + GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + cc##iter1 = predux_complex(&e0##iter2, &e0##iter3); \ + } else { \ + cc##iter1 = predux_complex(&e0##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } +#endif + +#define GEMV_WORK_ROW_COMPLEX(iter, N) \ + if (N > iter) { \ + PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex(a##iter, b, c0##iter, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_complex(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \ + } + +#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N) + +/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + dd0 = predux(c0##iter); + +#if EIGEN_COMP_LLVM +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) +#else +// gcc seems to be reading and writing registers unnecessarily to memory. +// Use the old way for complex double until it is fixed. 
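Concretely, the `_OLD` path defined next keeps one ordinary `pmadd` accumulator per row and leaves all complex bookkeeping to `predux`; in scalar form it is just (illustrative sketch):

```cpp
#include <complex>
#include <cstddef>

std::complex<float> row_dot_old(const std::complex<float>* lhs,
                                const std::complex<float>* rhs, std::size_t cols) {
  std::complex<float> c1(0.0f, 0.0f);            // GEMV_INIT_COMPLEX_OLD: c1 = pset_zero()
  for (std::size_t j = 0; j < cols; ++j)
    c1 += lhs[j] * rhs[j];                       // GEMV_WORK_ROW_COMPLEX_OLD: pcj.pmadd
  return c1;                                     // GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD: predux(c1)
}
```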
+ +#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \ + lhs.template load(i + (iter), j) + +#define GEMV_INIT_COMPLEX_OLD(iter, N) \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + if (N > iter) { \ + c1##iter = pset_zero(); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \ + if (N > iter) { \ + LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \ + c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] = predux(c1##iter2); \ + cc##iter1.scalar[1] = predux(c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket b0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + dd0 = predux(c1##iter); + +#if (__GNUC__ > 10) +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1 +#else +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \ + (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX +#endif + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + } +#endif + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) +#else +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE(N) +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; +#if !EIGEN_COMP_LLVM + conj_helper pcj; +#endif + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 
(rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; + + Index i = 0, j; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + GEMV_UNUSED_ROW(8, e0) + GEMV_UNUSED_EXTRA(1, c0) + GEMV_UNUSED_EXTRA(1, c1) +#endif + ResScalar dd0; +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; +#ifdef USE_GEMV_MMA + if (!GEMV_IS_COMPLEX_COMPLEX) +#endif + { + GEMV_PROCESS_ROW_COMPLEX(8) + } + GEMV_PROCESS_ROW_COMPLEX(4) + GEMV_PROCESS_ROW_COMPLEX(2) +#endif + for (; i < rows; ++i) + { + GEMV_PROCESS_ROW_COMPLEX_SINGLE(1) + GEMV_PROCESS_ROW_COMPLEX_PREDUX(0) + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, std::complex) + +#endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + diff --git a/libs/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100755 new mode 100644 index 2a44054..b0f8529 --- a/libs/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_ALTIVEC_H #define EIGEN_PACKET_MATH_ALTIVEC_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -39,34 +41,34 @@ typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; // We don't want to write the same code all 
the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead -#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = {X, X, X, X} -#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) -#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ Packet4ui p4ui_##NAME = {X, X, X, X} -#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} -#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ Packet2d p2d_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ Packet2l p2l_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) #define DST_CHAN 1 @@ -74,15 +76,17 @@ typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits::type // These constants are endian-agnostic -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} -static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); -static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); -static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} -static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); +static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} +static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} +static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); +static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); +#ifndef __POWER8_VECTOR__ +static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} +static EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); +#endif static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} #ifndef __VSX__ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} @@ -100,11 +104,13 @@ static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 
10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; +#ifndef _ARCH_PWR9 static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; +#endif +#ifdef _BIG_ENDIAN static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; -static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; -static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; +#endif static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 }; static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 }; @@ -114,15 +120,11 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 // Define global static constants: #ifdef _BIG_ENDIAN static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); -#ifdef __VSX__ -static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#endif static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; #else static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; -static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; @@ -135,18 +137,18 @@ static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -#ifdef _BIG_ENDIAN -static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#else -static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -#endif // _BIG_ENDIAN - #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); #else #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif +#if EIGEN_COMP_LLVM +#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16") +#else +#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)") +#endif + template <> struct packet_traits : default_packet_traits { typedef Packet4f type; @@ -166,6 +168,9 @@ struct packet_traits : default_packet_traits { HasAbs = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, + HasACos = 1, + HasASin = 1, + HasATan = 1, HasLog = 1, HasExp = 1, #ifdef __VSX__ @@ -175,16 +180,19 @@ struct packet_traits : default_packet_traits { #else HasRsqrt = 0, #endif + HasTanh = EIGEN_FAST_MATH, + HasErf = 
EIGEN_FAST_MATH, + HasRint = 1, #else HasSqrt = 0, HasRsqrt = 0, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, + HasTanh = 0, + HasErf = 0, + HasRint = 0, #endif HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, HasNegate = 1, HasBlend = 1 }; @@ -217,16 +225,17 @@ struct packet_traits : default_packet_traits { #else HasRsqrt = 0, #endif + HasRint = 1, #else HasSqrt = 0, HasRsqrt = 0, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, + HasRint = 0, #endif + HasTanh = 0, + HasErf = 0, HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, HasNegate = 1, HasBlend = 1 }; @@ -247,7 +256,8 @@ struct packet_traits : default_packet_traits { HasShift = 1, HasMul = 1, HasDiv = 0, - HasBlend = 1 + HasBlend = 1, + HasCmp = 1 }; }; @@ -265,7 +275,8 @@ struct packet_traits : default_packet_traits { HasSub = 1, HasMul = 1, HasDiv = 0, - HasBlend = 1 + HasBlend = 1, + HasCmp = 1 }; }; @@ -283,7 +294,8 @@ struct packet_traits : default_packet_traits { HasSub = 1, HasMul = 1, HasDiv = 0, - HasBlend = 1 + HasBlend = 1, + HasCmp = 1 }; }; @@ -301,7 +313,8 @@ struct packet_traits : default_packet_traits { HasSub = 1, HasMul = 1, HasDiv = 0, - HasBlend = 1 + HasBlend = 1, + HasCmp = 1 }; }; @@ -319,7 +332,8 @@ struct packet_traits : default_packet_traits { HasSub = 1, HasMul = 1, HasDiv = 0, - HasBlend = 1 + HasBlend = 1, + HasCmp = 1 }; }; @@ -475,6 +489,119 @@ template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* fr return pload_common(reinterpret_cast(from)); } +template +EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from) +{ + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); + EIGEN_DEBUG_ALIGNED_LOAD + // Ignore partial input memory initialized +#if !EIGEN_COMP_LLVM + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +#ifdef __VSX__ + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#else + return vec_ld(0, from); +#endif +#if !EIGEN_COMP_LLVM + #pragma GCC diagnostic pop +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore(const bfloat16* from) +{ + return pload_ignore(reinterpret_cast(from)); +} + +template +EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset) +{ + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. 
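// NOTE (annotation, not part of the patch): the body below implements a
// length-limited load. On Power9 (_ARCH_PWR9) a single
// vec_xl_len(from, n * size) reads exactly n elements, which are then
// shifted into lane position `offset` when requested. On older targets the
// bytes are copied piecewise (16/8/4/2/1 at a time) into an aligned scratch
// buffer and re-loaded with pload_ignore, so lanes past the requested range
// stay uninitialized but nothing outside the n valid elements is read.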
+ const Index packet_size = unpacket_traits::size; + eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_UNUSED_VARIABLE(packet_size); + EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_UNUSED_VARIABLE(from); + Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size); + if (offset) { + Packet16uc shift = pset1(offset * 8 * size); +#ifdef _BIG_ENDIAN + load = Packet(vec_sro(Packet16uc(load), shift)); +#else + load = Packet(vec_slo(Packet16uc(load), shift)); +#endif + } + return load; +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size]; + unsigned char* load2 = reinterpret_cast(load + offset); + unsigned char* from2 = reinterpret_cast(const_cast<__UNPACK_TYPE__(Packet)*>(from)); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstoreu(load2, ploadu(from2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + } + return pload_ignore(load); +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial(const float* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial(const int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial(const short int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial(const unsigned short int* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial(const bfloat16* from, const Index n, const Index offset) +{ + return pload_partial_common(reinterpret_cast(from), n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial(const signed char* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial(const unsigned char* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + template EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ // some versions of GCC throw "unused-but-set-parameter" (float *to). @@ -523,6 +650,91 @@ template<> EIGEN_STRONG_INLINE void pstore(unsigned char* t pstore_common(to, from); } +template EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset) +{ + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. 
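// NOTE (annotation, not part of the patch): mirror of pload_partial_common.
// On Power9 the packet is first shifted by `offset` (vec_slo/vec_sro, the
// direction flipping with endianness) and vec_xst_len then writes exactly
// n * size bytes; otherwise the packet is spilled with pstore to an aligned
// scratch buffer and copied out piecewise, again avoiding any out-of-bounds
// write.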
+ const Index packet_size = unpacket_traits::size; + eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_UNUSED_VARIABLE(packet_size); + EIGEN_UNUSED_VARIABLE(to); + EIGEN_DEBUG_ALIGNED_STORE + Packet store = from; + if (offset) { + Packet16uc shift = pset1(offset * 8 * size); +#ifdef _BIG_ENDIAN + store = Packet(vec_slo(Packet16uc(store), shift)); +#else + store = Packet(vec_sro(Packet16uc(store), shift)); +#endif + } + vec_xst_len(store, to, n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size]; + pstore(store, from); + unsigned char* store2 = reinterpret_cast(store + offset); + unsigned char* to2 = reinterpret_cast(to); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstore(to2, ploadu(store2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + } +#endif +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(float* to, const Packet4f& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(int* to, const Packet4i& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(short int* to, const Packet8s& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(unsigned short int* to, const Packet8us& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(bfloat16* to, const Packet8bf& from, const Index n, const Index offset) +{ + pstore_partial_common(reinterpret_cast(to), from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(signed char* to, const Packet16c& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + +template<> EIGEN_ALWAYS_INLINE void pstore_partial(unsigned char* to, const Packet16uc& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + template EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) { @@ -600,168 +812,167 @@ pbroadcast4(const int *a, pbroadcast4_common(a, a0, a1, a2, a3); } -template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits::size) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - return pload(a); + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits::size]; + eigen_assert(n <= unpacket_traits::size && "number of elements will gather past end of packet"); + LOAD_STORE_UNROLL_16 + for (Index i = 0; i < n; i++) { + a[i] = from[i*stride]; + } + // Leave rest of the array uninitialized + return pload_ignore(a); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* 
from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather(const float* from, Index stride) { return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather(const int* from, Index stride) { return pgather_common(from, stride); } -template EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather(const short int* from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - a[4] = from[4*stride]; - a[5] = from[5*stride]; - a[6] = from[6*stride]; - a[7] = from[7*stride]; - return pload(a); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather(const unsigned short int* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather(const bfloat16* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather(const bfloat16* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather(const signed char* from, Index stride) { - return pgather_size8(from, stride); + return pgather_common(from, stride); } -template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather(const unsigned char* from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; - a[0] = from[0*stride]; - a[1] = from[1*stride]; - a[2] = from[2*stride]; - a[3] = from[3*stride]; - a[4] = from[4*stride]; - a[5] = from[5*stride]; - a[6] = from[6*stride]; - a[7] = from[7*stride]; - a[8] = from[8*stride]; - a[9] = from[9*stride]; - a[10] = from[10*stride]; - a[11] = from[11*stride]; - a[12] = from[12*stride]; - a[13] = from[13*stride]; - a[14] = from[14*stride]; - a[15] = from[15*stride]; - return pload(a); + return pgather_common(from, stride); } - -template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const signed char* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial(const float* from, Index stride, const Index n) { - return pgather_size16(from, stride); + return pgather_common(from, stride, n); } -template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const unsigned char* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial(const int* from, Index stride, const Index n) { - return pgather_size16(from, stride); + return pgather_common(from, stride, n); } -template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial(const short int* from, Index stride, const Index n) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial(const unsigned short int* from, Index 
stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial(const bfloat16* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial(const signed char* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial(const unsigned char* from, Index stride, const Index n) +{ + return pgather_common(from, stride, n); +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits::size) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits::size]; + eigen_assert(n <= unpacket_traits::size && "number of elements will scatter past end of packet"); pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - to[1*stride] = a[1]; - to[2*stride] = a[2]; - to[3*stride] = a[3]; + LOAD_STORE_UNROLL_16 + for (Index i = 0; i < n; i++) { + to[i*stride] = a[i]; + } } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(float* to, const Packet4f& from, Index stride) { - pscatter_size4(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(int* to, const Packet4i& from, Index stride) { - pscatter_size4(to, from, stride); + pscatter_common(to, from, stride); } -template EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(short int* to, const Packet8s& from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; - pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - to[1*stride] = a[1]; - to[2*stride] = a[2]; - to[3*stride] = a[3]; - to[4*stride] = a[4]; - to[5*stride] = a[5]; - to[6*stride] = a[6]; - to[7*stride] = a[7]; + pscatter_common(to, from, stride); } - -template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(unsigned short int* to, const Packet8us& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(bfloat16* to, const Packet8bf& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(signed char* to, const Packet16c& from, Index stride) { - pscatter_size8(to, from, stride); + pscatter_common(to, from, stride); } -template EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(unsigned char* to, const Packet16uc& from, Index stride) { - EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; - pstore<__UNPACK_TYPE__(Packet)>(a, from); - to[0*stride] = a[0]; - 
to[1*stride] = a[1]; - to[2*stride] = a[2]; - to[3*stride] = a[3]; - to[4*stride] = a[4]; - to[5*stride] = a[5]; - to[6*stride] = a[6]; - to[7*stride] = a[7]; - to[8*stride] = a[8]; - to[9*stride] = a[9]; - to[10*stride] = a[10]; - to[11*stride] = a[11]; - to[12*stride] = a[12]; - to[13*stride] = a[13]; - to[14*stride] = a[14]; - to[15*stride] = a[15]; + pscatter_common(to, from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(signed char* to, const Packet16c& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(float* to, const Packet4f& from, Index stride, const Index n) { - pscatter_size16(to, from, stride); + pscatter_common(to, from, stride, n); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned char* to, const Packet16uc& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(int* to, const Packet4i& from, Index stride, const Index n) { - pscatter_size16(to, from, stride); + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(short int* to, const Packet8s& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(unsigned short int* to, const Packet8us& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(bfloat16* to, const Packet8bf& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(signed char* to, const Packet16c& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(unsigned char* to, const Packet16uc& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } @@ -786,8 +997,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4f_ZERO - a; +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4i_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -829,6 +1054,12 @@ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); } +#ifdef __VSX__ +template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return 
vec_msub(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); } +#endif + template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { #ifdef __VSX__ @@ -872,19 +1103,29 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const return vec_nor(c,c); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } @@ -937,6 +1178,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { Packet4f res; @@ -947,11 +1189,15 @@ template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) return res; } +#endif template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef _BIG_ENDIAN +#if defined(__VSX__) || !defined(_BIG_ENDIAN) + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#else Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword @@ -959,9 +1205,6 @@ template 
EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPAC mask = vec_lvsl(0, from); // create the permute mask //TODO: Add static_cast here return (Packet) vec_perm(MSQ, LSQ, mask); // align the data -#else - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #endif } @@ -994,12 +1237,80 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char return ploadu_common(from); } +template EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n) +{ + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements will read past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_UNUSED_VARIABLE(packet_size); + EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size]; + unsigned char* load2 = reinterpret_cast(load); + unsigned char* from2 = reinterpret_cast(const_cast<__UNPACK_TYPE__(Packet)*>(from)); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstore(load2, ploadu(from2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(load2 + i) = *reinterpret_cast(from2 + i); + } + return pload_ignore(load); +#endif +} + +template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial(const float* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial(const int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial(const short int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial(const unsigned short int* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial(const bfloat16* from, const Index n) +{ + return ploadu_partial_common(reinterpret_cast(from), n); +} +template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial(const signed char* from, const Index n) +{ + return ploadu_partial_common(from, n); +} +template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial(const unsigned char* from, const Index n) +{ + return ploadu_partial_common(from, n); +} + template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) { Packet p; if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return vec_mergeh(p, p); } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { @@ -1015,7 +1326,7 @@ template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const short int* Packet8s p; if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE16_HI); + return vec_mergeh(p, p); } template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const unsigned short int* from) @@ -1023,7 +1334,7 @@ template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const unsigned shor Packet8us p; if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE16_HI); + 
return vec_mergeh(p, p); } template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const short int* from) @@ -1052,7 +1363,7 @@ template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const signed char* Packet16c p; if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE8_HI); + return vec_mergeh(p, p); } template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const unsigned char* from) @@ -1060,13 +1371,15 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const unsigned ch Packet16uc p; if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE8_HI); + return vec_mergeh(p, p); } template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE -#ifdef _BIG_ENDIAN +#if defined(__VSX__) || !defined(_BIG_ENDIAN) + vec_xst(from, 0, to); +#else // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! Packet16uc MSQ, LSQ, edges; @@ -1081,8 +1394,6 @@ template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE_ LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second -#else - vec_xst(from, 0, to); #endif } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) @@ -1114,6 +1425,73 @@ template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* t pstoreu_common(to, from); } +template EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n) +{ + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements will write past end of packet"); + const Index size = sizeof(__UNPACK_TYPE__(Packet)); +#ifdef _ARCH_PWR9 + EIGEN_UNUSED_VARIABLE(packet_size); + EIGEN_DEBUG_UNALIGNED_STORE + vec_xst_len(from, to, n * size); +#else + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size]; + pstore(store, from); + unsigned char* store2 = reinterpret_cast(store); + unsigned char* to2 = reinterpret_cast(to); + Index n2 = n * size; + Index i = 0; + if (16 <= n2) { + pstoreu(to2, pload(store2)); + i += 16; + } + if (i + 8 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 8; + } + if (i + 4 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 4; + } + if (i + 2 <= n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + i += 2; + } + if (i < n2) { + *reinterpret_cast(to2 + i) = *reinterpret_cast(store2 + i); + } +#endif +} + +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(float* to, const Packet4f& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(int* to, const Packet4i& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(short int* to, const Packet8s& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(unsigned short int* to, const Packet8us& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(bfloat16* to, const Packet8bf& from, const Index n) +{ + pstoreu_partial_common(reinterpret_cast(to), from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(signed char* to, const Packet16c& from, 
const Index n) +{ + pstoreu_partial_common(to, from, n); +} +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(unsigned char* to, const Packet16uc& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } @@ -1162,11 +1540,19 @@ template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) } template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) { +#ifdef _ARCH_PWR9 + return vec_revb(a); +#else return vec_perm(a, a, p16uc_REVERSE8); +#endif } template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) { +#ifdef _ARCH_PWR9 + return vec_revb(a); +#else return vec_perm(a, a, p16uc_REVERSE8); +#endif } template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) { @@ -1180,10 +1566,13 @@ template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { - _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); + EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); return pand(p8us_abs_mask, a); } +template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return vec_sra(a.m_val, vec_splat_u16(15)); } +template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31))); } + template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return vec_sra(a,reinterpret_cast(pset1(N))); } template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) @@ -1192,38 +1581,38 @@ template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& { return vec_sl(a,reinterpret_cast(pset1(N))); } template EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); Packet4ui r = vec_sl(reinterpret_cast(a), p4ui_mask); return reinterpret_cast(r); } template EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); Packet4ui r = vec_sr(reinterpret_cast(a), p4ui_mask); return reinterpret_cast(r); } template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); return vec_sr(a, p4ui_mask); } template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); return vec_sl(a, p4ui_mask); } template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); return vec_sl(a, p8us_mask); } template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) { - const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); return vec_sr(a, p8us_mask); } @@ -1232,7 +1621,7 @@ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){ } EIGEN_STRONG_INLINE 
Packet4f Bf16ToF32Odd(const Packet8bf& bf){ - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); return pand( reinterpret_cast(bf.m_val), reinterpret_cast(p4ui_high_mask) @@ -1242,7 +1631,7 @@ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){ // Simple interleaving of bool masks, prevents true values from being // converted to NaNs. EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) { - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); Packet4f bf_odd, bf_even; bf_odd = pand(reinterpret_cast(p4ui_high_mask), odd); bf_even = plogical_shift_right<16>(even); @@ -1254,18 +1643,18 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ Packet4ui lsb = plogical_shift_right<16>(input); lsb = pand(lsb, reinterpret_cast(p4i_ONE)); - _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); + EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); Packet4ui rounding_bias = padd(lsb, p4ui_BIAS); input = padd(input, rounding_bias); //Test NaN and Subnormal - Begin - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); Packet4ui exp = pand(p4ui_exp_mask, reinterpret_cast(p4f)); - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); Packet4ui mantissa = pand(p4ui_mantissa_mask, reinterpret_cast(p4f)); - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); @@ -1280,7 +1669,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ reinterpret_cast(is_mant_zero) ); - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); input = vec_sel(input, p4ui_nan, nan_selector); input = vec_sel(input, reinterpret_cast(p4f), subnormal_selector); //Test NaN and Subnormal - End @@ -1341,12 +1730,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); } -template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); -} template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); } @@ -1390,9 +1773,11 @@ template<> EIGEN_STRONG_INLINE Packet8bf pceil (const Packet8bf& a){ template<> EIGEN_STRONG_INLINE Packet8bf pround (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(pround, a); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8bf print (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(print, a); } +#endif template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { Packet4f a_even = Bf16ToF32Even(a); Packet4f a_odd = Bf16ToF32Odd(a); @@ -2100,7 +2485,11 @@ ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; +#ifdef __POWER8_VECTOR__ + Packet4ui mask = 
reinterpret_cast(vec_neg(reinterpret_cast(select))); +#else Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); +#endif return vec_sel(elsePacket, thenPacket, mask); } @@ -2115,7 +2504,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; +#ifdef __POWER8_VECTOR__ + Packet8us mask = reinterpret_cast(vec_neg(reinterpret_cast(select))); +#else Packet8us mask = reinterpret_cast(vec_cmpeq(select, p8us_ONE)); +#endif Packet8s result = vec_sel(elsePacket, thenPacket, mask); return result; } @@ -2123,7 +2516,11 @@ template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, cons template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; +#ifdef __POWER8_VECTOR__ + Packet8us mask = reinterpret_cast(vec_neg(reinterpret_cast(select))); +#else Packet8us mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p8us_ONE)); +#endif return vec_sel(elsePacket, thenPacket, mask); } @@ -2137,7 +2534,11 @@ template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, co ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; +#ifdef __POWER8_VECTOR__ + Packet16uc mask = reinterpret_cast(vec_neg(reinterpret_cast(select))); +#else Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); +#endif return vec_sel(elsePacket, thenPacket, mask); } @@ -2147,7 +2548,11 @@ template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, c ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; +#ifdef __POWER8_VECTOR__ + Packet16uc mask = reinterpret_cast(vec_neg(reinterpret_cast(select))); +#else Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); +#endif return vec_sel(elsePacket, thenPacket, mask); } @@ -2208,7 +2613,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packe Packet4f float_odd = Bf16ToF32Odd(a); Packet4ui int_even = pcast(float_even); Packet4ui int_odd = pcast(float_odd); - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); Packet4ui low_even = pand(int_even, p4ui_low_mask); Packet4ui low_odd = pand(int_odd, p4ui_low_mask); @@ -2231,7 +2636,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packe template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8us& a) { //short -> int -> float -> bfloat16 - const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); Packet4ui int_cast = reinterpret_cast(a); Packet4ui int_even = pand(int_cast, p4ui_low_mask); Packet4ui int_odd = plogical_shift_right<16>(int_cast); @@ -2301,6 +2706,7 @@ template<> struct packet_traits : 
default_packet_traits HasAbs = 1, HasSin = 0, HasCos = 0, + HasATan = 0, HasLog = 0, HasExp = 1, HasSqrt = 1, @@ -2345,12 +2751,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) return vec_xl(0, const_cast(from)); // cast needed by Clang } +template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial(const double* from, const Index n, const Index offset) +{ + return pload_partial_common(from, n, offset); +} + template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_xst(from, 0, to); } +template<> EIGEN_ALWAYS_INLINE void pstore_partial(double* to, const Packet2d& from, const Index n, const Index offset) +{ + pstore_partial_common(to, from, n, offset); +} + template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { Packet2d v = {from, from}; return v; @@ -2372,19 +2788,21 @@ pbroadcast4(const double *a, a3 = pset1(a[3]); } -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather(const double* from, Index stride) { - EIGEN_ALIGN16 double af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial(const double* from, Index stride, const Index n) { - EIGEN_ALIGN16 double af[2]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + return pgather_common(from, stride, n); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter(double* to, const Packet2d& from, Index stride) +{ + pscatter_common(to, from, stride); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial(double* to, const Packet2d& from, Index stride, const Index n) +{ + pscatter_common(to, from, stride, n); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return pset1(a) + p2d_COUNTDOWN; } @@ -2393,7 +2811,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p2d_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -2402,6 +2827,9 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_msub(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmsub(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmadd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { @@ -2465,6 +2893,11 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) return vec_xl(0, const_cast(from)); } +template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial(const double* 
from, const Index n) +{ + return ploadu_partial_common(from, n); +} + template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p; @@ -2479,16 +2912,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& vec_xst(from, 0, to); } +template<> EIGEN_ALWAYS_INLINE void pstoreu_partial(double* to, const Packet2d& from, const Index n) +{ + pstoreu_partial_common(to, from, n); +} + template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); + return vec_sld(a, a, 8); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } - +template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63))); } // VSX support varies between different compilers and even different // versions of the same compiler. For gcc version >= 4.9.3, we can use // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use @@ -2571,7 +3009,7 @@ template struct plogical_shift_left_impl; template -struct plogical_shift_left_impl= 0)>::type> { +struct plogical_shift_left_impl= 0)>> { static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { static const unsigned n = static_cast(N); const Packet4ui shift = {n, n, n, n}; @@ -2585,7 +3023,7 @@ struct plogical_shift_left_impl= 0)>::typ }; template -struct plogical_shift_left_impl= 32)>::type> { +struct plogical_shift_left_impl= 32)>> { static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { static const unsigned m = static_cast(N - 32); const Packet4ui shift = {m, m, m, m}; @@ -2603,7 +3041,7 @@ template struct plogical_shift_right_impl; template -struct plogical_shift_right_impl= 0)>::type> { +struct plogical_shift_right_impl= 0)>> { static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { static const unsigned n = static_cast(N); const Packet4ui shift = {n, n, n, n}; @@ -2617,7 +3055,7 @@ struct plogical_shift_right_impl= 0)>::ty }; template -struct plogical_shift_right_impl= 32)>::type> { +struct plogical_shift_right_impl= 32)>> { static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { static const unsigned m = static_cast(N - 32); const Packet4ui shift = {m, m, m, m}; @@ -2690,8 +3128,8 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet2d t0, t1; - t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); - t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[1]); kernel.packet[0] = t0; kernel.packet[1] = t1; } diff --git a/libs/eigen/Eigen/src/Core/arch/Default/BFloat16.h b/libs/eigen/Eigen/src/Core/arch/Default/BFloat16.h index 1c28f4f..d2137d4 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/BFloat16.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/BFloat16.h @@ -16,6 +16,20 @@ limitations under the License. 
#ifndef EIGEN_BFLOAT16_H #define EIGEN_BFLOAT16_H +#include "../../InternalHeaderCheck.h" + +#if defined(EIGEN_HAS_HIP_BF16) +// When compiling with GPU support, the "hip_bfloat16" base class as well as +// some other routines are defined in the GPU compiler header files +// (hip_bfloat16.h), and they are not tagged constexpr +// As a consequence, we get compile failures when compiling Eigen with +// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building +// Eigen with GPU support + #pragma push_macro("EIGEN_CONSTEXPR") + #undef EIGEN_CONSTEXPR + #define EIGEN_CONSTEXPR +#endif + #define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \ template <> \ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \ @@ -23,19 +37,47 @@ limitations under the License. return F32ToBf16(METHOD(Bf16ToF32(_x))); \ } +// Only use HIP GPU bf16 in kernels +#if defined(EIGEN_HAS_HIP_BF16) && defined(EIGEN_GPU_COMPILE_PHASE) +#define EIGEN_USE_HIP_BF16 +#endif + namespace Eigen { struct bfloat16; +namespace numext { +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast(const uint16_t& src); + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::bfloat16& src); +} // namespace numext namespace bfloat16_impl { +#if defined(EIGEN_USE_HIP_BF16) + +struct __bfloat16_raw : public hip_bfloat16 { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(hip_bfloat16 hb) : hip_bfloat16(hb) {} + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : hip_bfloat16(raw) {} +}; + +#else + // Make our own __bfloat16_raw definition. struct __bfloat16_raw { +#if defined(EIGEN_HAS_HIP_BF16) && !defined(EIGEN_GPU_COMPILE_PHASE) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {} +#else EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {} +#endif explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {} unsigned short value; }; +#endif // defined(EIGEN_USE_HIP_BF16) + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value); template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); @@ -83,57 +125,114 @@ struct bfloat16 : public bfloat16_impl::bfloat16_base { return bfloat16_impl::bfloat16_to_float(*this); } }; -} // namespace Eigen -namespace std { -template<> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = std::denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = numeric_limits::round_style; - static const bool is_iec559 = false; - static const bool is_bounded = true; - static const bool is_modulo = false; - static const int digits = 8; - static const int digits10 = 2; - static const int max_digits10 = 4; - static const int radix = 2; - static const int min_exponent = numeric_limits::min_exponent; - static const int min_exponent10 = numeric_limits::min_exponent10; - static const int max_exponent = numeric_limits::max_exponent; - static const int max_exponent10 = numeric_limits::max_exponent10; - static const bool traps = numeric_limits::traps; - static const bool 
tinyness_before = numeric_limits::tinyness_before; +// TODO(majnemer): Get rid of this once we can rely on C++17 inline variables to +// solve the ODR issue. +namespace bfloat16_impl { +template +struct numeric_limits_bfloat16_impl { + static EIGEN_CONSTEXPR const bool is_specialized = true; + static EIGEN_CONSTEXPR const bool is_signed = true; + static EIGEN_CONSTEXPR const bool is_integer = false; + static EIGEN_CONSTEXPR const bool is_exact = false; + static EIGEN_CONSTEXPR const bool has_infinity = true; + static EIGEN_CONSTEXPR const bool has_quiet_NaN = true; + static EIGEN_CONSTEXPR const bool has_signaling_NaN = true; + static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present; + static EIGEN_CONSTEXPR const bool has_denorm_loss = false; + static EIGEN_CONSTEXPR const std::float_round_style round_style = std::numeric_limits::round_style; + static EIGEN_CONSTEXPR const bool is_iec559 = true; + // The C++ standard defines this as "true if the set of values representable + // by the type is finite." BFloat16 has finite precision. + static EIGEN_CONSTEXPR const bool is_bounded = true; + static EIGEN_CONSTEXPR const bool is_modulo = false; + static EIGEN_CONSTEXPR const int digits = 8; + static EIGEN_CONSTEXPR const int digits10 = 2; + static EIGEN_CONSTEXPR const int max_digits10 = 4; + static EIGEN_CONSTEXPR const int radix = std::numeric_limits::radix; + static EIGEN_CONSTEXPR const int min_exponent = std::numeric_limits::min_exponent; + static EIGEN_CONSTEXPR const int min_exponent10 = std::numeric_limits::min_exponent10; + static EIGEN_CONSTEXPR const int max_exponent = std::numeric_limits::max_exponent; + static EIGEN_CONSTEXPR const int max_exponent10 = std::numeric_limits::max_exponent10; + static EIGEN_CONSTEXPR const bool traps = std::numeric_limits::traps; + // IEEE754: "The implementer shall choose how tininess is detected, but shall + // detect tininess in the same way for all operations in radix two" + static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits::tinyness_before; - static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); } - static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); } - static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); } - static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); } - static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); } - static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); } - static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); } - static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); } - static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); } + static EIGEN_CONSTEXPR Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); } + static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); } + static EIGEN_CONSTEXPR Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); } + static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); } + static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); } + static
EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); } + static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); } + static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0); } + static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); } }; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_specialized; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_signed; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_integer; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_exact; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::has_infinity; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::has_quiet_NaN; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::has_signaling_NaN; +template +EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl::has_denorm; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::has_denorm_loss; +template +EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl::round_style; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_iec559; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_bounded; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::is_modulo; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::digits; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::digits10; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::max_digits10; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::radix; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::min_exponent; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::min_exponent10; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::max_exponent; +template +EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl::max_exponent10; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::traps; +template +EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl::tinyness_before; +} // end namespace bfloat16_impl +} // end namespace Eigen + +namespace std { // If std::numeric_limits is specialized, should also specialize // std::numeric_limits, std::numeric_limits, and // std::numeric_limits // https://stackoverflow.com/a/16519653/ template<> -struct numeric_limits : numeric_limits {}; +class numeric_limits : public Eigen::bfloat16_impl::numeric_limits_bfloat16_impl<> {}; template<> -struct numeric_limits : numeric_limits {}; +class numeric_limits : public numeric_limits {}; template<> -struct numeric_limits : numeric_limits {}; -} // namespace std +class numeric_limits : public numeric_limits {}; +template<> +class numeric_limits : public numeric_limits {}; +} // end namespace std namespace Eigen { @@ -148,7 +247,7 @@ namespace bfloat16_impl { // We need to provide emulated *host-side* BF16 operators for clang. #pragma push_macro("EIGEN_DEVICE_FUNC") #undef EIGEN_DEVICE_FUNC -#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16) +#if (defined(EIGEN_HAS_GPU_BF16) && defined(EIGEN_HAS_NATIVE_BF16)) #define EIGEN_DEVICE_FUNC __host__ #else // both host and device need emulated ops. 
#define EIGEN_DEVICE_FUNC __host__ __device__ @@ -177,9 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, co return bfloat16(float(a) / float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) { - bfloat16 result; - result.value = a.value ^ 0x8000; - return result; + numext::uint16_t x = numext::bit_cast(a) ^ 0x8000; + return numext::bit_cast(x); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) { a = bfloat16(float(a) + float(b)); @@ -246,38 +344,47 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, In } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) { +#if defined(EIGEN_USE_HIP_BF16) + return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(v, __bfloat16_raw::truncate)); +#else __bfloat16_raw output; - if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { + if (numext::isnan EIGEN_NOT_A_MACRO(v)) { output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; return output; } - const uint16_t* p = reinterpret_cast(&v); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - output.value = p[0]; -#else - output.value = p[1]; -#endif + output.value = static_cast(numext::bit_cast(v) >> 16); return output; +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) { +#if defined(EIGEN_USE_HIP_BF16) + __bfloat16_raw bf; + bf.data = value; + return bf; +#else return __bfloat16_raw(value); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) { +#if defined(EIGEN_USE_HIP_BF16) + return bf.data; +#else return bf.value; +#endif } // float_to_bfloat16_rtne template specialization that does not make any // assumption about the value of its function argument (ff). template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { -#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) - // Nothing to do here +#if defined(EIGEN_USE_HIP_BF16) + return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff)); #else __bfloat16_raw output; - if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) { + if (numext::isnan EIGEN_NOT_A_MACRO(ff)) { // If the value is a NaN, squash it to a qNaN with msb of fraction set, // this makes sure after truncation we don't end up with an inf. 
// @@ -446,8 +553,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { -#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) - // Nothing to do here +#if defined(EIGEN_USE_HIP_BF16) + return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff)); #else numext::uint32_t input = numext::bit_cast(ff); __bfloat16_raw output; @@ -462,36 +569,41 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(&result); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = h.value; +#if defined(EIGEN_USE_HIP_BF16) + return static_cast(h); #else - q[1] = h.value; + return numext::bit_cast(static_cast(h.value) << 16); #endif - return result; } + // --- standard functions --- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) { EIGEN_USING_STD(isinf); +#if defined(EIGEN_USE_HIP_BF16) + return (isinf)(a); // Uses HIP hip_bfloat16 isinf operator +#else return (isinf)(float(a)); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) { EIGEN_USING_STD(isnan); +#if defined(EIGEN_USE_HIP_BF16) + return (isnan)(a); // Uses HIP hip_bfloat16 isnan operator +#else return (isnan)(float(a)); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) { return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) { - bfloat16 result; - result.value = a.value & 0x7FFF; - return result; + numext::uint16_t x = numext::bit_cast(a) & 0x7FFF; + return numext::bit_cast(x); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { - return bfloat16(::expf(float(a))); + return bfloat16(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { return bfloat16(numext::expm1(float(a))); @@ -509,11 +621,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) { return bfloat16(static_cast(EIGEN_LOG2E) * ::logf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { - return bfloat16(::sqrtf(float(a))); + return bfloat16(::sqrtf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) { return bfloat16(::powf(float(a), float(b))); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan2(const bfloat16& a, const bfloat16& b) { + return bfloat16(::atan2f(float(a), float(b))); +} EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { return bfloat16(::sinf(float(a))); } @@ -541,7 +656,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { return bfloat16(::tanhf(float(a))); } -#if EIGEN_HAS_CXX11_MATH EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { return bfloat16(::asinhf(float(a))); } @@ -551,7 +665,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { return bfloat16(::atanhf(float(a))); } -#endif EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { return bfloat16(::floorf(float(a))); } @@ -573,6 +686,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bf const float f2 = static_cast(b); return f2 < f1 ? 
b : a; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) { const float f1 = static_cast(a); const float f2 = static_cast(b); @@ -584,6 +698,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfl const float f2 = static_cast(b); return bfloat16(::fminf(f1, f2)); } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) { const float f1 = static_cast(a); const float f2 = static_cast(b); @@ -633,7 +748,6 @@ template<> struct NumTraits } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() { return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D); // bfloat16(5e-2f); - } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() { return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F); @@ -651,6 +765,11 @@ } // namespace Eigen + +#if defined(EIGEN_HAS_HIP_BF16) + #pragma pop_macro("EIGEN_CONSTEXPR") +#endif + namespace Eigen { namespace numext { @@ -674,7 +793,7 @@ bool (isfinite)(const Eigen::bfloat16& h) { template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast(const uint16_t& src) { - return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src)); + return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src); } template <> @@ -696,5 +815,49 @@ struct hash { } // namespace std #endif +// Add the missing shfl* intrinsics. +// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300. +// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) +// +// HIP and CUDA prior to SDK 9.0 define +// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float +// CUDA since 9.0 deprecates those and instead defines +// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, +// with native support for __half and __nv_bfloat16 +// +// Note that the following are __device__-only functions.
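A scalar aside, not part of the patch: the wrappers guarded below move the 16 payload bits of a bfloat16 through the integer-only __shfl* primitives by widening to int with bit_cast and narrowing back afterwards. A minimal host-side model of that round trip, with an illustrative function name and the shuffle itself stubbed out:

#include <cassert>
#include <cstdint>

// Move a 16-bit payload through an int-only channel, as the bfloat16 __shfl
// wrappers do on device: widen losslessly, transport, then truncate back.
inline std::uint16_t through_int_channel(std::uint16_t payload_bits) {
  const int widened = static_cast<int>(payload_bits);  // lossless widen to int
  const int shuffled = widened;                        // stand-in for __shfl(widened, srcLane, width)
  return static_cast<std::uint16_t>(shuffled);         // low 16 bits restore the payload
}

int main() {
  assert(through_int_channel(0x3F80u) == 0x3F80u);     // 0x3F80 is the bit pattern of bfloat16(1.0f)
}

The narrowing cast is safe because the shuffle only transports the value between lanes; it never does arithmetic on it.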
+#if defined(EIGEN_HIPCC) + +#if defined(EIGEN_HAS_HIP_BF16) + +__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl(ivar, srcLane, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_up(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_down(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_xor(ivar, laneMask, width))); +} + +#endif // HIP + +#endif // __shfl* + +#if defined(EIGEN_HIPCC) +EIGEN_STRONG_INLINE __device__ Eigen::bfloat16 __ldg(const Eigen::bfloat16* ptr) { + return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(__ldg(Eigen::numext::bit_cast(ptr))); +} +#endif // __ldg #endif // EIGEN_BFLOAT16_H diff --git a/libs/eigen/Eigen/src/Core/arch/Default/ConjHelper.h b/libs/eigen/Eigen/src/Core/arch/Default/ConjHelper.h index 53830b5..6b5afe3 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/ConjHelper.h @@ -38,6 +38,8 @@ } \ }; +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index c9fbaf6..3060214 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -16,6 +16,8 @@ #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -26,11 +28,11 @@ template<> struct make_integer { typedef numext::int64_t type; }; template<> struct make_integer { typedef numext::int16_t type; }; template<> struct make_integer { typedef numext::int16_t type; }; -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) { typedef typename unpacket_traits::type Scalar; typedef typename unpacket_traits::integer_packet PacketI; - enum { mantissa_bits = numext::numeric_limits::digits - 1}; + static constexpr int mantissa_bits = numext::numeric_limits::digits - 1; return pcast(plogical_shift_right(preinterpret(pabs(a)))); } @@ -40,42 +42,41 @@ template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) { typedef typename unpacket_traits::type Scalar; typedef typename make_unsigned::type>::type ScalarUI; - enum { + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = int(TotalBits) - int(MantissaBits) - 1 - }; + ExponentBits = TotalBits - MantissaBits - 1; - EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = - ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << 
int(MantissaBits)); // ~0x7f800000 - const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); + EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = + ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000 + const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); const Packet half = pset1(Scalar(0.5)); const Packet zero = pzero(a); const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 - + // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). const Packet is_denormal = pcmp_lt(pabs(a), normal_min); - EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24 + EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24 // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 - const Packet normalization_factor = pset1(scalar_normalization_factor); + const Packet normalization_factor = pset1(scalar_normalization_factor); const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); - + // Determine exponent offset: -126 if normal, -126-24 if denormal - const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126 + const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(ExponentBits-1)) - ScalarUI(2)); // -126 Packet exponent_offset = pset1(scalar_exponent_offset); const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); - + // Determine exponent and mantissa from normalized_a. 
exponent = pfrexp_generic_get_biased_exponent(normalized_a); // Zero, Inf and NaN return 'a' unmodified, exponent is zero // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) - const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)); // 255 + const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255 const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); - exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); + exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); return m; } @@ -108,25 +109,24 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) { typedef typename unpacket_traits::integer_packet PacketI; typedef typename unpacket_traits::type Scalar; typedef typename unpacket_traits::type ScalarI; - enum { + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = int(TotalBits) - int(MantissaBits) - 1 - }; + ExponentBits = TotalBits - MantissaBits - 1; - const Packet max_exponent = pset1(Scalar((ScalarI(1)<((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127 + const Packet max_exponent = pset1(Scalar((ScalarI(1)<((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1)); // 127 const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); - Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) b = psub(psub(psub(e, b), b), b); // e - 3b - c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) out = pmul(out, c); return out; } -// Explicitly multiplies +// Explicitly multiplies // a * (2^e) // clamping e to the range // [NumTraits::min_exponent()-2, NumTraits::max_exponent()] @@ -140,20 +140,19 @@ struct pldexp_fast_impl { typedef typename unpacket_traits::integer_packet PacketI; typedef typename unpacket_traits::type Scalar; typedef typename unpacket_traits::type ScalarI; - enum { + static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits::digits - 1, - ExponentBits = int(TotalBits) - int(MantissaBits) - 1 - }; - + ExponentBits = TotalBits - MantissaBits - 1; + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) { - const Packet bias = pset1(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127 - const Packet limit = pset1(Scalar((ScalarI(1)<(Scalar((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1)<(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 // return a * (2^e) - return pmul(a, preinterpret(plogical_shift_left(e))); + return pmul(a, preinterpret(plogical_shift_left(e))); } }; @@ -165,36 +164,16 @@ struct pldexp_fast_impl { // polynomial interpolants -> ... -> profit! 
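A scalar sketch, not part of the patch, of the decomposition the rewritten plog_impl_float below implements; std::log1p stands in for the new rational (3,3) approximant and the function name is illustrative only:

#include <cassert>
#include <cmath>

// Scalar model of the vector code: split x = m * 2^e with m in [0.5, 1),
// fold m into [sqrt(1/2), sqrt(2)) so the interpolation interval is centered
// on 1, then recombine log(x) = e*ln(2) + log(1 + r). x is assumed positive
// and finite.
inline float log_by_range_reduction(float x) {
  int e;
  float m = std::frexp(x, &e);       // x = m * 2^e, m in [0.5, 1)
  if (m < 0.7071067811865476f) {     // m < sqrt(1/2): double m ...
    m += m;
    e -= 1;                          // ... and take the factor of 2 back out of e
  }
  const float r = m - 1.0f;          // centered argument in [sqrt(1/2)-1, sqrt(2)-1]
  return std::log1p(r) + static_cast<float>(e) * 0.69314718f;  // e*ln(2) + log(1+r)
}

int main() {
  assert(std::fabs(log_by_range_reduction(10.0f) - std::log(10.0f)) < 1e-6f);
}

Doubling m whenever it falls below sqrt(1/2), and compensating in e, is essentially what the branch-free pcmp_lt/pand mask sequence in the packet code accomplishes.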
template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_impl_float(const Packet _x) { - Packet x = _x; - const Packet cst_1 = pset1(1.0f); - const Packet cst_neg_half = pset1(-0.5f); - // The smallest non denormalized float number. - const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); - const Packet cst_minus_inf = pset1frombits( 0xff800000u); - const Packet cst_pos_inf = pset1frombits( 0x7f800000u); + const Packet cst_minus_inf = pset1frombits(static_cast(0xff800000u)); + const Packet cst_pos_inf = pset1frombits(static_cast(0x7f800000u)); - // Polynomial coefficients. const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); - const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); - const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); - const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); - const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); - const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); - const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); - const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); - const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); - const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); - - // Truncate input values to the minimum positive normal. - x = pmax(x, cst_min_norm_pos); - - Packet e; + Packet e, x; // extract significant in the range [0.5,1) and exponent - x = pfrexp(x,e); + x = pfrexp(_x,e); // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) // and shift by -1. The values are then centered around 0, which improves @@ -209,24 +188,22 @@ Packet plog_impl_float(const Packet _x) e = psub(e, pand(cst_1, mask)); x = padd(x, tmp); - Packet x2 = pmul(x, x); - Packet x3 = pmul(x2, x); + // Polynomial coefficients for rational (3,3) r(x) = p(x)/q(x) + // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1]. + const Packet cst_p1 = pset1(1.0000000190281136f); + const Packet cst_p2 = pset1(1.0000000190281063f); + const Packet cst_p3 = pset1(0.18256296349849254f); + const Packet cst_q1 = pset1(1.4999999999999927f); + const Packet cst_q2 = pset1(0.59923249590823520f); + const Packet cst_q3 = pset1(0.049616247954120038f); - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet y, y1, y2; - y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); - y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); - y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); - y = pmadd(y, x, cst_cephes_log_p2); - y1 = pmadd(y1, x, cst_cephes_log_p5); - y2 = pmadd(y2, x, cst_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y = pmadd(cst_neg_half, x2, y); - x = padd(x, y); + Packet p = pmadd(x, cst_p3, cst_p2); + p = pmadd(x, p, cst_p1); + p = pmul(x, p); + Packet q = pmadd(x, cst_q3, cst_q2); + q = pmadd(x, q, cst_q1); + q = pmadd(x, q, cst_1); + x = pdiv(p, q); // Add the logarithm of the exponent back to the result of the interpolation. 
if (base2) { @@ -250,7 +227,6 @@ Packet plog_impl_float(const Packet _x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_float(const Packet _x) { return plog_impl_float(_x); @@ -258,7 +234,6 @@ Packet plog_float(const Packet _x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog2_float(const Packet _x) { return plog_impl_float(_x); @@ -275,15 +250,12 @@ Packet plog2_float(const Packet _x) */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_impl_double(const Packet _x) { Packet x = _x; const Packet cst_1 = pset1(1.0); const Packet cst_neg_half = pset1(-0.5); - // The smallest non denormalized double. - const Packet cst_min_norm_pos = pset1frombits( static_cast(0x0010000000000000ull)); const Packet cst_minus_inf = pset1frombits( static_cast(0xfff0000000000000ull)); const Packet cst_pos_inf = pset1frombits( static_cast(0x7ff0000000000000ull)); @@ -305,9 +277,6 @@ Packet plog_impl_double(const Packet _x) const Packet cst_cephes_log_q4 = pset1(7.11544750618563894466E1); const Packet cst_cephes_log_q5 = pset1(2.31251620126765340583E1); - // Truncate input values to the minimum positive normal. - x = pmax(x, cst_min_norm_pos); - Packet e; // extract significant in the range [0.5,1) and exponent x = pfrexp(x,e); @@ -371,7 +340,6 @@ Packet plog_impl_double(const Packet _x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_double(const Packet _x) { return plog_impl_double(_x); @@ -379,7 +347,6 @@ Packet plog_double(const Packet _x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog2_double(const Packet _x) { return plog_impl_double(_x); @@ -433,26 +400,27 @@ Packet generic_expm1(const Packet& x) // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +// exp(r) is computed using a 6th order minimax polynomial approximation. template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pexp_float(const Packet _x) { - const Packet cst_1 = pset1(1.0f); + const Packet cst_zero = pset1(0.0f); + const Packet cst_one = pset1(1.0f); const Packet cst_half = pset1(0.5f); - const Packet cst_exp_hi = pset1( 88.723f); - const Packet cst_exp_lo = pset1(-88.723f); + const Packet cst_exp_hi = pset1(88.723f); + const Packet cst_exp_lo = pset1(-104.f); const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); - const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); - const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); - const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); - const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); - const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); - const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + const Packet cst_p2 = pset1(0.49999988079071044921875f); + const Packet cst_p3 = pset1(0.16666518151760101318359375f); + const Packet cst_p4 = pset1(4.166965186595916748046875e-2f); + const Packet cst_p5 = pset1(8.36894474923610687255859375e-3f); + const Packet cst_p6 = pset1(1.37449637986719608306884765625e-3f); // Clamp x. - Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); + Packet zero_mask = pcmp_lt(_x, cst_exp_lo); + Packet x = pmin(_x, cst_exp_hi); // Express exp(x) as exp(m*ln(2) + r), start by extracting // m = floor(x/ln(2) + 0.5). 
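A scalar model of the scheme the comment above describes: write x = m*ln(2) + r with m = floor(x/ln(2) + 1/2), evaluate the new degree-6 polynomial for exp(r) on [-ln(2)/2, ln(2)/2], then scale by 2^m. The hi/lo split of ln(2) uses the usual Cephes constants, which the surrounding unchanged lines are assumed to still define as cst_cephes_exp_C1/C2; std::ldexp stands in for pldexp, and the clamping and zero-mask handling are elided.

#include <cmath>
#include <cstdio>

static float exp_sketch(float x) {
  const float LOG2E  = 1.44269504088896341f;
  const float LN2_HI = -0.693359375f;           // assumed cst_cephes_exp_C1
  const float LN2_LO = 2.12194440e-4f;          // assumed cst_cephes_exp_C2
  const float m = std::floor(x * LOG2E + 0.5f);
  float r = m * LN2_HI + x;                     // r = x - m*ln(2), computed
  r = m * LN2_LO + r;                           // in two steps for precision
  const float r2 = r * r;
  float p_even = r2 * 1.37449637986719608306884765625e-3f + 4.166965186595916748046875e-2f;
  const float p_odd = r2 * 8.36894474923610687255859375e-3f + 0.16666518151760101318359375f;
  p_even = r2 * p_even + 0.49999988079071044921875f;
  float y = r * p_odd + p_even;                 // odd/even split as in the packet code
  y = r2 * y + (r + 1.0f);                      // exp(r) ~= 1 + r + r^2 * y
  return std::ldexp(y, static_cast<int>(m));    // 2^m * exp(r)
}

int main() { std::printf("%.6f vs %.6f\n", exp_sketch(3.5f), std::exp(3.5f)); }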
@@ -466,31 +434,27 @@ Packet pexp_float(const Packet _x) Packet r = pmadd(m, cst_cephes_exp_C1, x); r = pmadd(m, cst_cephes_exp_C2, r); - Packet r2 = pmul(r, r); - Packet r3 = pmul(r2, r); - - // Evaluate the polynomial approximant,improved by instruction-level parallelism. - Packet y, y1, y2; - y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1); - y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4); - y2 = padd(r, cst_1); - y = pmadd(y, r, cst_cephes_exp_p2); - y1 = pmadd(y1, r, cst_cephes_exp_p5); - y = pmadd(y, r3, y1); - y = pmadd(y, r2, y2); + // Evaluate the 6th order polynomial approximation to exp(r) + // with r in the interval [-ln(2)/2;ln(2)/2]. + const Packet r2 = pmul(r, r); + Packet p_even = pmadd(r2, cst_p6, cst_p4); + const Packet p_odd = pmadd(r2, cst_p5, cst_p3); + p_even = pmadd(r2, p_even, cst_p2); + const Packet p_low = padd(r, cst_one); + Packet y = pmadd(r, p_odd, p_even); + y = pmadd(r2, y, p_low); // Return 2^m * exp(r). // TODO: replace pldexp with faster implementation since y in [-1, 1). - return pmax(pldexp(y,m), _x); + return pselect(zero_mask, cst_zero, pmax(pldexp(y,m), _x)); } template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pexp_double(const Packet _x) { Packet x = _x; - + const Packet cst_zero = pset1(0.0); const Packet cst_1 = pset1(1.0); const Packet cst_2 = pset1(2.0); const Packet cst_half = pset1(0.5); @@ -512,7 +476,8 @@ Packet pexp_double(const Packet _x) Packet tmp, fx; // clamp x - x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); + Packet zero_mask = pcmp_lt(_x, cst_exp_lo); + x = pmin(x, cst_exp_hi); // Express exp(x) as exp(g + n*log(2)). fx = pmadd(cst_cephes_LOG2EF, x, cst_half); @@ -550,7 +515,7 @@ Packet pexp_double(const Packet _x) // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. // TODO: replace pldexp with faster implementation since x in [-1, 1). - return pmax(pldexp(x,fx), _x); + return pselect(zero_mask, cst_zero, pmax(pldexp(x,fx), _x)); } // The following code is inspired by the following stack-overflow answer: @@ -562,7 +527,7 @@ Packet pexp_double(const Packet _x) // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. // - Avoid a branch in rounding and extraction of the remaining fractional part. // Overall, I measured a speed up higher than x2 on x86-64. -inline float trig_reduce_huge (float xf, int *quadrant) +inline float trig_reduce_huge (float xf, Eigen::numext::int32_t *quadrant) { using Eigen::numext::int32_t; using Eigen::numext::uint32_t; @@ -570,7 +535,7 @@ inline float trig_reduce_huge (float xf, int *quadrant) using Eigen::numext::uint64_t; const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 - const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format // 192 bits of 2/pi for Payne-Hanek reduction // Bits are introduced by packet of 8 to enable aligned reads. 
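For orientation, this is what trig_reduce_huge computes, modeled in plain double precision: the quadrant index round(x * 2/pi) mod 4 and the remainder x - k*pi/2. The real routine obtains the same pair exactly for arbitrarily large floats via the 2.62 fixed-point product with the 2/pi table; the double version below is only an illustrative stand-in, valid for moderate |x|.

#include <cmath>
#include <cstdint>
#include <cstdio>

static float trig_reduce_model(float xf, std::int32_t* quadrant) {
  const double two_over_pi = 0.63661977236758134308;
  const double pi_over_two = 1.57079632679489661923;
  const double x = static_cast<double>(xf);
  const double k = std::nearbyint(x * two_over_pi);       // nearest multiple of pi/2
  *quadrant = static_cast<std::int32_t>(std::fmod(k, 4.0));
  return static_cast<float>(x - k * pi_over_two);         // remainder in [-pi/4, pi/4]
}

int main() {
  std::int32_t q;
  const float r = trig_reduce_model(1.0e4f, &q);
  std::printf("quadrant=%d remainder=%f\n", q, r);
}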
@@ -618,8 +583,7 @@ inline float trig_reduce_huge (float xf, int *quadrant) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT +#if EIGEN_COMP_GNUC_STRICT __attribute__((optimize("-fno-unsafe-math-optimizations"))) #endif Packet psincos_float(const Packet& _x) @@ -629,20 +593,20 @@ Packet psincos_float(const Packet& _x) const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding const PacketI csti_1 = pset1(1); - const Packet cst_sign_mask = pset1frombits(0x80000000u); + const Packet cst_sign_mask = pset1frombits(static_cast(0x80000000u)); Packet x = pabs(_x); // Scale x by 2/Pi to find x's octant. Packet y = pmul(x, cst_2oPI); - // Rounding trick: + // Rounding trick to find nearest integer: Packet y_round = padd(y, cst_rounding_magic); EIGEN_OPTIMIZATION_BARRIER(y_round) PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) - y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi + y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi) - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) // This version requires true FMA for high accuracy @@ -685,7 +649,7 @@ Packet psincos_float(const Packet& _x) const int PacketSize = unpacket_traits::size; EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; - EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize]; pstoreu(vals, pabs(_x)); pstoreu(x_cpy, x); pstoreu(y_int2, y_int); @@ -743,7 +707,6 @@ Packet psincos_float(const Packet& _x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet psin_float(const Packet& x) { return psincos_float(x); @@ -751,16 +714,268 @@ Packet psin_float(const Packet& x) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pcos_float(const Packet& x) { return psincos_float(x); } +// Generic implementation of acos(x). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pacos_float(const Packet& x_in) { + typedef typename unpacket_traits::type Scalar; + static_assert(std::is_same::value, "Scalar type must be float"); + + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_pi = pset1(Scalar(EIGEN_PI)); + const Packet p6 = pset1(Scalar(2.26911413483321666717529296875e-3)); + const Packet p5 = pset1(Scalar(-1.1063250713050365447998046875e-2)); + const Packet p4 = pset1(Scalar(2.680264413356781005859375e-2)); + const Packet p3 = pset1(Scalar(-4.87488098442554473876953125e-2)); + const Packet p2 = pset1(Scalar(8.874166011810302734375e-2)); + const Packet p1 = pset1(Scalar(-0.2145837843418121337890625)); + const Packet p0 = pset1(Scalar(1.57079613208770751953125)); + + // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth + // function, by a 6'th order polynomial. + // For x in [-1:0) we use that acos(-x) = pi - acos(x). + const Packet neg_mask = pcmp_lt(x_in, pzero(x_in)); + Packet x = pabs(x_in); + const Packet invalid_mask = pcmp_lt(pset1(1.0f), x); + + // Evaluate the polynomial using Horner's rule: + // P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) . 
+ // We evaluate even and odd terms independently to increase + // instruction level parallelism. + Packet x2 = pmul(x_in,x_in); + Packet p_even = pmadd(p6, x2, p4); + Packet p_odd = pmadd(p5, x2, p3); + p_even = pmadd(p_even, x2, p2); + p_odd = pmadd(p_odd, x2, p1); + p_even = pmadd(p_even, x2, p0); + Packet p = pmadd(p_odd, x, p_even); + + // The polynomial approximates acos(x)/sqrt(1-x), so + // multiply by sqrt(1-x) to get acos(x). + Packet denom = psqrt(psub(cst_one, x)); + Packet result = pmul(denom, p); + + // Undo mapping for negative arguments. + result = pselect(neg_mask, psub(cst_pi, result), result); + // Return NaN for arguments outside [-1:1]. + return pselect(invalid_mask, + pset1(std::numeric_limits::quiet_NaN()), + result); +} + +// Generic implementation of asin(x). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pasin_float(const Packet& x_in) { + typedef typename unpacket_traits::type Scalar; + static_assert(std::is_same::value, "Scalar type must be float"); + + // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with + // even terms only. + const Packet p9 = pset1(Scalar(5.08838854730129241943359375e-2f)); + const Packet p7 = pset1(Scalar(3.95139865577220916748046875e-2f)); + const Packet p5 = pset1(Scalar(7.550220191478729248046875e-2f)); + const Packet p3 = pset1(Scalar(0.16664917767047882080078125f)); + const Packet p1 = pset1(Scalar(1.00000011920928955078125f)); + + const Packet neg_mask = pcmp_lt(x_in, pzero(x_in)); + Packet x = pabs(x_in); + const Packet invalid_mask = pcmp_lt(pset1(1.0f), x); + // For arguments |x| > 0.5, we map x back to [0:0.5] using + // the transformation x_large = sqrt(0.5*(1-x)), and use the + // identity + // asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x))) + const Packet cst_half = pset1(Scalar(0.5f)); + const Packet cst_two = pset1(Scalar(2)); + Packet x_large = psqrt(pnmadd(cst_half, x, cst_half)); + const Packet large_mask = pcmp_lt(cst_half, x); + x = pselect(large_mask, x_large, x); + + // Compute polynomial. + // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9)))) + Packet x2 = pmul(x, x); + Packet p = pmadd(p9, x2, p7); + p = pmadd(p, x2, p5); + p = pmadd(p, x2, p3); + p = pmadd(p, x2, p1); + p = pmul(p, x); + + constexpr float kPiOverTwo = static_cast(EIGEN_PI/2); + Packet p_large = pnmadd(cst_two, p, pset1(kPiOverTwo)); + p = pselect(large_mask, p_large, p); + // Flip the sign for negative arguments. + p = pselect(neg_mask, pnegate(p), p); + + // Return NaN for arguments outside [-1:1]. + return pselect(invalid_mask, pset1(std::numeric_limits::quiet_NaN()), p); +} + +// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy. +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan_reduced_float(const Packet& x) { + const Packet q0 = pset1(-0.3333314359188079833984375f); + const Packet q2 = pset1(0.19993579387664794921875f); + const Packet q4 = pset1(-0.14209578931331634521484375f); + const Packet q6 = pset1(0.1066047251224517822265625f); + const Packet q8 = pset1(-7.5408883392810821533203125e-2f); + const Packet q10 = pset1(4.3082617223262786865234375e-2f); + const Packet q12 = pset1(-1.62907354533672332763671875e-2f); + const Packet q14 = pset1(2.90188402868807315826416015625e-3f); + + // Approximate atan(x) by a polynomial of the form + // P(x) = x + x^3 * Q(x^2), + // where Q(x^2) is a 7th order polynomial in x^2. 
+ // We evaluate even and odd terms in x^2 in parallel + // to take advantage of instruction level parallelism + // and hardware with multiple FMA units. + const Packet x2 = pmul(x, x); + const Packet x4 = pmul(x2, x2); + Packet q_odd = pmadd(q14, x4, q10); + Packet q_even = pmadd(q12, x4, q8); + q_odd = pmadd(q_odd, x4, q6); + q_even = pmadd(q_even, x4, q4); + q_odd = pmadd(q_odd, x4, q2); + q_even = pmadd(q_even, x4, q0); + const Packet q = pmadd(q_odd, x2, q_even); + return pmadd(q, pmul(x, x2), x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan_float(const Packet& x_in) { + typedef typename unpacket_traits::type Scalar; + static_assert(std::is_same::value, "Scalar type must be float"); + + const Packet cst_one = pset1(1.0f); + constexpr float kPiOverTwo = static_cast(EIGEN_PI/2); + + // "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x). + // "Small": For |x| <= 1, approximate atan(x) directly by a polynomial + // calculated using Sollya. + const Packet neg_mask = pcmp_lt(x_in, pzero(x_in)); + const Packet large_mask = pcmp_lt(cst_one, pabs(x_in)); + const Packet large_shift = pselect(neg_mask, pset1(-kPiOverTwo), pset1(kPiOverTwo)); + const Packet x = pselect(large_mask, preciprocal(x_in), x_in); + const Packet p = patan_reduced_float(x); + + // Apply transformations according to the range reduction masks. + return pselect(large_mask, psub(large_shift, p), p); +} + +// Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)] +// with 2 ulp accuracy. +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet +patan_reduced_double(const Packet& x) { + const Packet q0 = + pset1(-0.33333333333330028569463365784031338989734649658203); + const Packet q2 = + pset1(0.199999999990664090177006073645316064357757568359375); + const Packet q4 = + pset1(-0.142857141937123677255527809393242932856082916259766); + const Packet q6 = + pset1(0.111111065991039953404495577160560060292482376098633); + const Packet q8 = + pset1(-9.0907812986129224452902519715280504897236824035645e-2); + const Packet q10 = + pset1(7.6900542950704739442180368769186316058039665222168e-2); + const Packet q12 = + pset1(-6.6410112986494976294871150912513257935643196105957e-2); + const Packet q14 = + pset1(5.6920144995467943094258345126945641823112964630127e-2); + const Packet q16 = + pset1(-4.3577020814990513608577771265117917209863662719727e-2); + const Packet q18 = + pset1(2.1244050233624342527427586446719942614436149597168e-2); + + // Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form + // P(x) = x + x^3 * Q(x^2), + // where Q(x^2) is a 9th order polynomial in x^2. + // We evaluate even and odd terms in x^2 in parallel + // to take advantage of instruction level parallelism + // and hardware with multiple FMA units. 
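The even/odd split this comment refers to (used by patan_reduced_float above and by the x4 chains that follow) is the transformation below, shown for a generic polynomial in t = x^2. The two Horner recurrences in t^2 carry independent dependency chains, so they can overlap on hardware with two FMA pipes; the names are illustrative.

#include <cstdio>

// Reference: one long Horner chain for q0 + q1*t + ... + q[n-1]*t^(n-1).
static double q_horner(double t, const double* q, int n) {
  double acc = q[n - 1];
  for (int i = n - 2; i >= 0; --i) acc = acc * t + q[i];
  return acc;
}

// Even/odd split: two independent Horner chains in t^2, recombined with one
// extra fma. Assumes n is even (as for the 8- and 10-coefficient kernels here).
static double q_horner_split(double t, const double* q, int n) {
  const double t2 = t * t;
  double even = q[n - 2], odd = q[n - 1];
  for (int i = n - 4; i >= 0; i -= 2) {
    even = even * t2 + q[i];
    odd = odd * t2 + q[i + 1];
  }
  return odd * t + even;
}

int main() {
  const double q[6] = {1, 2, 3, 4, 5, 6};
  std::printf("%.17g == %.17g\n", q_horner(0.25, q, 6), q_horner_split(0.25, q, 6));
}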
+ const Packet x2 = pmul(x, x); + const Packet x4 = pmul(x2, x2); + Packet q_odd = pmadd(q18, x4, q14); + Packet q_even = pmadd(q16, x4, q12); + q_odd = pmadd(q_odd, x4, q10); + q_even = pmadd(q_even, x4, q8); + q_odd = pmadd(q_odd, x4, q6); + q_even = pmadd(q_even, x4, q4); + q_odd = pmadd(q_odd, x4, q2); + q_even = pmadd(q_even, x4, q0); + const Packet p = pmadd(q_odd, x2, q_even); + return pmadd(p, pmul(x, x2), x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan_double(const Packet& x_in) { + typedef typename unpacket_traits::type Scalar; + static_assert(std::is_same::value, "Scalar type must be double"); + + const Packet cst_one = pset1(1.0); + constexpr double kPiOverTwo = static_cast(EIGEN_PI / 2); + const Packet cst_pi_over_two = pset1(kPiOverTwo); + constexpr double kPiOverFour = static_cast(EIGEN_PI / 4); + const Packet cst_pi_over_four = pset1(kPiOverFour); + const Packet cst_large = pset1(2.4142135623730950488016887); // tan(3*pi/8); + const Packet cst_medium = pset1(0.4142135623730950488016887); // tan(pi/8); + + const Packet neg_mask = pcmp_lt(x_in, pzero(x_in)); + Packet x = pabs(x_in); + + // Use the same range reduction strategy (to [0:tan(pi/8)]) as the + // Cephes library: + // "Large": For x >= tan(3*pi/8), use atan(1/x) = pi/2 - atan(x). + // "Medium": For x in [tan(pi/8) : tan(3*pi/8)), + // use atan(x) = pi/4 + atan((x-1)/(x+1)). + // "Small": For x < tan(pi/8), approximate atan(x) directly by a polynomial + // calculated using Sollya. + const Packet large_mask = pcmp_lt(cst_large, x); + x = pselect(large_mask, preciprocal(x), x); + const Packet medium_mask = pandnot(pcmp_lt(cst_medium, x), large_mask); + x = pselect(medium_mask, pdiv(psub(x, cst_one), padd(x, cst_one)), x); + + // Compute approximation of p ~= atan(x') where x' is the argument reduced to + // [0:tan(pi/8)]. + Packet p = patan_reduced_double(x); + + // Apply transformations according to the range reduction masks. + p = pselect(large_mask, psub(cst_pi_over_two, p), p); + p = pselect(medium_mask, padd(cst_pi_over_four, p), p); + return pselect(neg_mask, pnegate(p), p); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdiv_complex(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::as_real RealPacket; + // In the following we annotate the code for the case where the inputs + // are a pair length-2 SIMD vectors representing a single pair of complex + // numbers x = a + i*b, y = c + i*d. + const RealPacket y_abs = pabs(y.v); // |c|, |d| + const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c| + const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|) + const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|) + // Compute scaled denominator. + const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2 + const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v); + Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c + // Divide elementwise by denom. + result_scaled = Packet(pdiv(result_scaled.v, denom)); + // Rescale result + return Packet(pdiv(result_scaled.v, y_max)); +} template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet psqrt_complex(const Packet& a) { typedef typename unpacket_traits::type Scalar; typedef typename Scalar::value_type RealScalar; @@ -832,8 +1047,8 @@ Packet psqrt_complex(const Packet& a) { // Step 4. 
Compute solution for inputs with negative real part: // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] - const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); - const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; + const RealPacket cst_imag_sign_mask = + pset1(Scalar(RealScalar(0.0), RealScalar(-0.0))).v; RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); Packet negative_real_result; // Notice that rho is positive, so taking its absolute value is a noop. @@ -871,6 +1086,98 @@ Packet psqrt_complex(const Packet& a) { pselect(is_real_inf, real_inf_result,result)); } + +template +struct psign_impl< + Packet, + std::enable_if_t< + !NumTraits::type>::IsComplex && + !NumTraits::type>::IsInteger>> { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { + using Scalar = typename unpacket_traits::type; + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_minus_one = pset1(Scalar(-1)); + const Packet cst_zero = pzero(a); + + const Packet not_nan_mask = pcmp_eq(a, a); + const Packet positive_mask = pcmp_lt(cst_zero, a); + const Packet positive = pand(positive_mask, cst_one); + const Packet negative_mask = pcmp_lt(a, cst_zero); + const Packet negative = pand(negative_mask, cst_minus_one); + + return pselect(not_nan_mask, por(positive, negative), a); + } +}; + +template +struct psign_impl< + Packet, std::enable_if_t< + !NumTraits::type>::IsComplex && + NumTraits::type>::IsSigned && + NumTraits::type>::IsInteger>> { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { + using Scalar = typename unpacket_traits::type; + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_minus_one = pset1(Scalar(-1)); + const Packet cst_zero = pzero(a); + + const Packet positive_mask = pcmp_lt(cst_zero, a); + const Packet positive = pand(positive_mask, cst_one); + const Packet negative_mask = pcmp_lt(a, cst_zero); + const Packet negative = pand(negative_mask, cst_minus_one); + + return por(positive, negative); + } +}; + +template +struct psign_impl::type>::IsComplex && + !NumTraits::type>::IsSigned && + NumTraits::type>::IsInteger>> { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { + using Scalar = typename unpacket_traits::type; + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_zero = pzero(a); + + const Packet zero_mask = pcmp_eq(cst_zero, a); + return pandnot(cst_one, zero_mask); + } +}; + +// \internal \returns the sign of a complex number z, defined as z / abs(z). +template +struct psign_impl::type>::IsComplex && + unpacket_traits::vectorizable>> { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename unpacket_traits::as_real RealPacket; + + // Step 1. Compute (for each element z = x + i*y in a) + // l = abs(z) = sqrt(x^2 + y^2). + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l = (zmin == 0 ?
zmax : zmax * sqrt(1 + (zmin/zmax)**2)), + // where zmax = max(|x|, |y|), zmin = min(|x|, |y|), + RealPacket a_abs = pabs(a.v); + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; + RealPacket a_max = pmax(a_abs, a_abs_flip); + RealPacket a_min = pmin(a_abs, a_abs_flip); + RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); + RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + const RealPacket cst_one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1] + // Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be + // lossy. + l = pselect(a_min_zero_mask, a_max, l); + // Step 2. Compute a / abs(a). + RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask); + Packet sign; + sign.v = sign_as_real; + return sign; + } +}; + // TODO(rmlarsen): The following set of utilities for double word arithmetic // should perhaps be refactored as a separate file, since it would be generally // useful for special function implementation etc. Writing the algorithms in @@ -1040,32 +1347,23 @@ void twoprod(const Packet& x_hi, const Packet& x_lo, fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); } -// This function computes the reciprocal of a floating point number -// with extra precision and returns the result as a double word. +// This function implements the division of double word {x_hi, x_lo} +// by float y. This is Algorithm 15 from "Tight and rigorous error bounds +// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu, +// 2017. https://hal.archives-ouvertes.fr/hal-01351529 template -void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) { - typedef typename unpacket_traits::type Scalar; - // 1. Approximate the reciprocal as the reciprocal of the high order element. - Packet approx_recip = prsqrt(x); - approx_recip = pmul(approx_recip, approx_recip); - - // 2. Run one step of Newton-Raphson iteration in double word arithmetic - // to get the bottom half. The NR iteration for reciprocal of 'a' is - // x_{i+1} = x_i * (2 - a * x_i) - - // -a*x_i - Packet t1_hi, t1_lo; - twoprod(pnegate(x), approx_recip, t1_hi, t1_lo); - // 2 - a*x_i - Packet t2_hi, t2_lo; - fast_twosum(pset1(Scalar(2)), t1_hi, t2_hi, t2_lo); - Packet t3_hi, t3_lo; - fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo); - // x_i * (2 - a * x_i) - twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo); +void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& z_hi, Packet& z_lo) { + const Packet t_hi = pdiv(x_hi, y); + Packet pi_hi, pi_lo; + twoprod(t_hi, y, pi_hi, pi_lo); + const Packet delta_hi = psub(x_hi, pi_hi); + const Packet delta_t = psub(delta_hi, pi_lo); + const Packet delta = padd(delta_t, x_lo); + const Packet t_lo = pdiv(delta, y); + fast_twosum(t_hi, t_lo, z_hi, z_lo); } - // This function computes log2(x) and returns the result as a double word. template struct accurate_log2 { @@ -1204,16 +1502,13 @@ struct accurate_log2 { const Packet cst_2_log2e_hi = pset1(2.88539008177792677); const Packet cst_2_log2e_lo = pset1(4.07660016854549667e-17); // c * (x - 1) - Packet num_hi, num_lo; - twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo); - // TODO(rmlarsen): Investigate if using the division algorithm by - // Muller et al. is faster/more accurate.
- // 1 / (x + 1) - Packet denom_hi, denom_lo; - doubleword_reciprocal(padd(x, one), denom_hi, denom_lo); - // r = c * (x-1) / (x+1), + Packet t_hi, t_lo; + // t = c * (x-1) + twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo); + // r = c * (x-1) / (x+1), Packet r_hi, r_lo; - twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo); + doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo); + // r2 = r * r Packet r2_hi, r2_lo; twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo); @@ -1443,39 +1738,40 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { } // Generic implementation of pow(x,y). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet generic_pow(const Packet& x, const Packet& y) { +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { typedef typename unpacket_traits::type Scalar; const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_neg_inf = pset1(-NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); const Packet abs_x = pabs(x); // Predicates for sign and magnitude of x. - const Packet x_is_zero = pcmp_eq(x, cst_zero); - const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero); + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); - const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); - const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); - const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); // Predicates for sign and magnitude of y. + const Packet abs_y = pabs(y); const Packet y_is_one = pcmp_eq(y, cst_one); - const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero); const Packet y_is_neg = pcmp_lt(y, cst_zero); - const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg)); const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); - const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf); EIGEN_CONSTEXPR Scalar huge_exponent = - (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / - NumTraits::epsilon(); + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits::epsilon(); const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); // Predicates for whether y is integer and/or even. 
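The predicate network in the next hunk encodes the IEC 60559 pow special cases, now including the negative-infinity results that the old code collapsed to +inf. For orientation, a few of the corners it must reproduce, cross-checked against std::pow; this scalar check is added here purely for illustration.

#include <cmath>
#include <cstdio>

int main() {
  const double cases[][2] = {
      {-0.0, -3.0},       // -> -inf   (negative zero base, negative odd integer y)
      {-0.0, 3.0},        // -> -0
      {-1.0, INFINITY},   // -> 1      (|x| == 1, infinite y)
      {0.5, -INFINITY},   // -> inf    (|x| < 1, y -> -inf)
      {-2.0, 3.0},        // -> -8     (negative base, odd integer y)
      {-2.0, 0.5},        // -> nan    (negative base, non-integer y)
  };
  for (const auto& c : cases)
    std::printf("pow(%g, %g) = %g\n", c[0], c[1], std::pow(c[0], c[1]));
}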
@@ -1484,39 +1780,31 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) - const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), - y_is_int), - abs_y_is_inf); - const Packet pow_is_one = por(por(x_is_one, y_is_zero), - pand(x_is_neg_one, - por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); - const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), - pand(abs_x_is_inf, y_is_neg)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_pos)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_neg)); - const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), - pand(abs_x_is_inf, y_is_pos)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_neg)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_pos)); + const Packet pow_is_one = + por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos)); + const Packet inf_val = + pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even), + cst_neg_inf, cst_pos_inf); // General computation of pow(x,y) for positive x or negative x and integer y. const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); const Packet pow_abs = generic_pow_impl(abs_x, y); - return pselect(y_is_one, x, - pselect(pow_is_one, cst_one, - pselect(pow_is_nan, cst_nan, - pselect(pow_is_inf, cst_pos_inf, - pselect(pow_is_zero, cst_zero, - pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); + return pselect( + y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, inf_val, + pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); } - - /* polevl (modified for Eigen) * * Evaluate polynomial @@ -1643,6 +1931,267 @@ struct pchebevl { } }; +namespace unary_pow { +template ::IsInteger> +struct is_odd { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent run(const ScalarExponent& x) { + ScalarExponent xdiv2 = x / ScalarExponent(2); + ScalarExponent floorxdiv2 = numext::floor(xdiv2); + return xdiv2 != floorxdiv2; + } +}; +template +struct is_odd { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent run(const ScalarExponent& x) { + return x % ScalarExponent(2); + } +}; + +template ::type>::IsInteger> +struct do_div { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) { + typedef typename unpacket_traits::type Scalar; + const Packet cst_pos_one = pset1(Scalar(1)); + return exponent < 0 ? 
pdiv(cst_pos_one, x) : x; + } +}; + +template +struct do_div { + // pdiv not defined, nor necessary for integer base types + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) { + EIGEN_UNUSED_VARIABLE(exponent); + return x; + } +}; + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) { + typedef typename unpacket_traits::type Scalar; + const Packet cst_pos_one = pset1(Scalar(1)); + if (exponent == 0) return cst_pos_one; + Packet result = x; + Packet y = cst_pos_one; + ScalarExponent m = numext::abs(exponent); + while (m > 1) { + bool odd = is_odd::run(m); + if (odd) y = pmul(y, result); + result = pmul(result, result); + m = numext::floor(m / ScalarExponent(2)); + } + result = pmul(y, result); + result = do_div::run(result, exponent); + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x, + const typename unpacket_traits::type& exponent) { + const Packet exponent_packet = pset1(exponent); + return generic_pow_impl(x, exponent_packet); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_int_errors(const Packet& x, const Packet& powx, + const ScalarExponent& exponent) { + typedef typename unpacket_traits::type Scalar; + + // non-integer base, integer exponent case + + const bool exponent_is_odd = is_odd::run(exponent); + const bool exponent_is_neg = exponent < 0; + + const Packet exp_is_odd = exponent_is_odd ? ptrue(x) : pzero(x); + const Packet exp_is_neg = exponent_is_neg ? ptrue(x) : pzero(x); + + const Scalar pos_zero = Scalar(0); + const Scalar neg_zero = -Scalar(0); + const Scalar pos_one = Scalar(1); + const Scalar pos_inf = NumTraits::infinity(); + const Scalar neg_inf = -NumTraits::infinity(); + + const Packet cst_pos_zero = pset1(pos_zero); + const Packet cst_neg_zero = pset1(neg_zero); + const Packet cst_pos_one = pset1(pos_one); + const Packet cst_pos_inf = pset1(pos_inf); + const Packet cst_neg_inf = pset1(neg_inf); + + const Packet abs_x = pabs(x); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_pos_zero); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one); + const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); + + if (exponent == 0) { + return cst_pos_one; + } + + Packet pow_is_pos_inf = pand(pandnot(abs_x_is_zero, x_is_neg_zero), pand(exp_is_odd, exp_is_neg)); + pow_is_pos_inf = por(pow_is_pos_inf, pand(abs_x_is_zero, pandnot(exp_is_neg, exp_is_odd))); + pow_is_pos_inf = por(pow_is_pos_inf, pand(pand(abs_x_is_inf, x_is_neg), pandnot(pnot(exp_is_neg), exp_is_odd))); + pow_is_pos_inf = por(pow_is_pos_inf, pandnot(pandnot(abs_x_is_inf, x_is_neg), exp_is_neg)); + + Packet pow_is_neg_inf = pand(x_is_neg_zero, pand(exp_is_neg, exp_is_odd)); + pow_is_neg_inf = por(pow_is_neg_inf, pand(pand(abs_x_is_inf, x_is_neg), pandnot(exp_is_odd, exp_is_neg))); + + Packet pow_is_pos_zero = pandnot(abs_x_is_zero, exp_is_neg); + pow_is_pos_zero = por(pow_is_pos_zero, pand(pand(abs_x_is_inf, x_is_neg), pandnot(exp_is_neg, exp_is_odd))); + pow_is_pos_zero = por(pow_is_pos_zero, pand(pandnot(abs_x_is_inf, x_is_neg), exp_is_neg)); + + Packet pow_is_neg_zero = pand(x_is_neg_zero, pandnot(exp_is_odd, exp_is_neg)); + pow_is_neg_zero = por(pow_is_neg_zero, 
pand(pand(abs_x_is_inf, x_is_neg), pand(exp_is_odd, exp_is_neg))); + + Packet result = pselect(pow_is_neg_inf, cst_neg_inf, powx); + result = pselect(pow_is_neg_zero, cst_neg_zero, result); + result = pselect(pow_is_pos_zero, cst_pos_zero, result); + result = pselect(pow_is_pos_inf, cst_pos_inf, result); + result = pselect(pandnot(abs_x_is_one, x_is_neg), cst_pos_one, result); + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx, + const ScalarExponent& exponent) { + typedef typename unpacket_traits::type Scalar; + + // non-integer base and exponent case + + const bool exponent_is_fin = (numext::isfinite)(exponent); + const bool exponent_is_nan = (numext::isnan)(exponent); + const bool exponent_is_neg = exponent < 0; + const bool exponent_is_inf = !exponent_is_fin && !exponent_is_nan; + + const Packet exp_is_neg = exponent_is_neg ? ptrue(x) : pzero(x); + const Packet exp_is_inf = exponent_is_inf ? ptrue(x) : pzero(x); + + const Scalar pos_zero = Scalar(0); + const Scalar pos_one = Scalar(1); + const Scalar pos_inf = NumTraits::infinity(); + const Scalar neg_inf = -NumTraits::infinity(); + const Scalar nan = NumTraits::quiet_NaN(); + + const Packet cst_pos_zero = pset1(pos_zero); + const Packet cst_pos_one = pset1(pos_one); + const Packet cst_pos_inf = pset1(pos_inf); + const Packet cst_neg_inf = pset1(neg_inf); + const Packet cst_nan = pset1(nan); + + const Packet abs_x = pabs(x); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_pos_zero); + const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_pos_one); + const Packet abs_x_is_gt_one = pcmp_lt(cst_pos_one, abs_x); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one); + const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + + if (exponent_is_nan) { + return pselect(pandnot(abs_x_is_one, x_is_neg), cst_pos_one, cst_nan); + } + + Packet pow_is_pos_zero = pandnot(abs_x_is_zero, exp_is_neg); + pow_is_pos_zero = por(pow_is_pos_zero, pand(abs_x_is_gt_one, pand(exp_is_inf, exp_is_neg))); + pow_is_pos_zero = por(pow_is_pos_zero, pand(abs_x_is_lt_one, pandnot(exp_is_inf, exp_is_neg))); + pow_is_pos_zero = por(pow_is_pos_zero, pand(abs_x_is_inf, exp_is_neg)); + + const Packet pow_is_pos_one = pand(abs_x_is_one, exp_is_inf); + + Packet pow_is_pos_inf = pand(abs_x_is_zero, exp_is_neg); + pow_is_pos_inf = por(pow_is_pos_inf, pand(abs_x_is_lt_one, pand(exp_is_inf, exp_is_neg))); + pow_is_pos_inf = por(pow_is_pos_inf, pand(abs_x_is_gt_one, pandnot(exp_is_inf, exp_is_neg))); + pow_is_pos_inf = por(pow_is_pos_inf, pandnot(abs_x_is_inf, exp_is_neg)); + + const Packet pow_is_nan = pandnot(pandnot(x_is_neg, abs_x_is_inf), exp_is_inf); + + Packet result = pselect(pow_is_pos_inf, cst_pos_inf, powx); + result = pselect(pow_is_pos_one, cst_pos_one, result); + result = pselect(pow_is_pos_zero, cst_pos_zero, result); + result = pselect(pow_is_nan, cst_nan, result); + result = pselect(pandnot(abs_x_is_one, x_is_neg), cst_pos_one, result); + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_int_int(const Packet& x, const ScalarExponent& exponent) { + typedef typename unpacket_traits::type Scalar; + + // integer base, integer exponent case + + // This routine handles negative and very large positive exponents + // Signed integer overflow and divide by zero is undefined 
behavior + // Unsigned integers do not overflow + + const bool exponent_is_odd = unary_pow::is_odd::run(exponent); + + const Scalar zero = Scalar(0); + const Scalar pos_one = Scalar(1); + + const Packet cst_zero = pset1(zero); + const Packet cst_pos_one = pset1(pos_one); + + const Packet abs_x = pabs(x); + + const Packet pow_is_zero = exponent < 0 ? pcmp_lt(cst_pos_one, abs_x) : pzero(x); + const Packet pow_is_one = pcmp_eq(cst_pos_one, abs_x); + const Packet pow_is_neg = exponent_is_odd ? pcmp_lt(x, cst_zero) : pzero(x); + + Packet result = pselect(pow_is_zero, cst_zero, x); + result = pselect(pandnot(pow_is_one, pow_is_neg), cst_pos_one, result); + result = pselect(pand(pow_is_one, pow_is_neg), pnegate(cst_pos_one), result); + return result; +} +} // end namespace unary_pow + +template ::type>::IsInteger, + bool ExponentIsIntegerType = NumTraits::IsInteger> +struct unary_pow_impl; + +template +struct unary_pow_impl { + typedef typename unpacket_traits::type Scalar; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) { + const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent; + if (exponent_is_integer) { + Packet result = unary_pow::int_pow(x, exponent); + result = unary_pow::handle_nonint_int_errors(x, result, exponent); + return result; + } else { + Packet result = unary_pow::gen_pow(x, exponent); + result = unary_pow::handle_nonint_nonint_errors(x, result, exponent); + return result; + } + } +}; + +template +struct unary_pow_impl { + typedef typename unpacket_traits::type Scalar; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) { + Packet result = unary_pow::int_pow(x, exponent); + result = unary_pow::handle_nonint_int_errors(x, result, exponent); + return result; + } +}; + +template +struct unary_pow_impl { + typedef typename unpacket_traits::type Scalar; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) { + if (exponent < 0 || exponent > NumTraits::digits()) { + return unary_pow::handle_int_int(x, exponent); + } + else { + return unary_pow::int_pow(x, exponent); + } + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 177a04e..179c55c 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -42,25 +44,21 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent); /** \internal \returns log(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_float(const Packet _x); /** \internal \returns log2(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog2_float(const Packet _x); /** \internal \returns log(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog_double(const Packet _x); /** \internal \returns log2(x) for single precision float */ template
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet plog2_double(const Packet _x); /** \internal \returns log(1 + x) */ @@ -74,33 +72,53 @@ Packet generic_expm1(const Packet& x); /** \internal \returns exp(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pexp_float(const Packet _x); /** \internal \returns exp(x) for double precision real numbers */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pexp_double(const Packet _x); /** \internal \returns sin(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet psin_float(const Packet& x); /** \internal \returns cos(x) for single precision float */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet pcos_float(const Packet& x); +/** \internal \returns asin(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pasin_float(const Packet& x); + +/** \internal \returns acos(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pacos_float(const Packet& x); + +/** \internal \returns atan(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan_float(const Packet& x); + +/** \internal \returns atan(x) for double precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan_double(const Packet& x); + /** \internal \returns sqrt(x) for complex types */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED Packet psqrt_complex(const Packet& a); +/** \internal \returns x / y for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdiv_complex(const Packet& x, const Packet& y); + template struct ppolevl; diff --git a/libs/eigen/Eigen/src/Core/arch/Default/Half.h b/libs/eigen/Eigen/src/Core/arch/Default/Half.h index 9f8e8cc..75d6228 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/Half.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/Half.h @@ -36,7 +36,7 @@ #ifndef EIGEN_HALF_H #define EIGEN_HALF_H -#include +#include "../../InternalHeaderCheck.h" #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as @@ -202,57 +202,113 @@ struct half : public half_impl::half_base { #endif }; -} // end namespace Eigen +// TODO(majnemer): Get rid of this once we can rely on C++17 inline variables to +// solve the ODR issue. +namespace half_impl { +template +struct numeric_limits_half_impl { + static EIGEN_CONSTEXPR const bool is_specialized = true; + static EIGEN_CONSTEXPR const bool is_signed = true; + static EIGEN_CONSTEXPR const bool is_integer = false; + static EIGEN_CONSTEXPR const bool is_exact = false; + static EIGEN_CONSTEXPR const bool has_infinity = true; + static EIGEN_CONSTEXPR const bool has_quiet_NaN = true; + static EIGEN_CONSTEXPR const bool has_signaling_NaN = true; + static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present; + static EIGEN_CONSTEXPR const bool has_denorm_loss = false; + static EIGEN_CONSTEXPR const std::float_round_style round_style = std::round_to_nearest; + static EIGEN_CONSTEXPR const bool is_iec559 = true; + // The C++ standard defines this as "true if the set of values representable + // by the type is finite." Half has finite precision.
+ static EIGEN_CONSTEXPR const bool is_bounded = true; + static EIGEN_CONSTEXPR const bool is_modulo = false; + static EIGEN_CONSTEXPR const int digits = 11; + static EIGEN_CONSTEXPR const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static EIGEN_CONSTEXPR const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static EIGEN_CONSTEXPR const int radix = std::numeric_limits::radix; + static EIGEN_CONSTEXPR const int min_exponent = -13; + static EIGEN_CONSTEXPR const int min_exponent10 = -4; + static EIGEN_CONSTEXPR const int max_exponent = 16; + static EIGEN_CONSTEXPR const int max_exponent10 = 4; + static EIGEN_CONSTEXPR const bool traps = std::numeric_limits::traps; + // IEEE754: "The implementer shall choose how tininess is detected, but shall + // detect tininess in the same way for all operations in radix two" + static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits::tinyness_before; -namespace std { -template<> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } - static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } - static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } + static EIGEN_CONSTEXPR Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); } + static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static EIGEN_CONSTEXPR Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); } + static EIGEN_CONSTEXPR Eigen::half round_error() { return 
Eigen::half_impl::raw_uint16_to_half(0x3800); } + static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static EIGEN_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static EIGEN_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } + static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); } }; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_specialized; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_signed; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_integer; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_exact; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::has_infinity; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::has_quiet_NaN; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::has_signaling_NaN; +template +EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl::has_denorm; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::has_denorm_loss; +template +EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl::round_style; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_iec559; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_bounded; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::is_modulo; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::digits; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::digits10; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::max_digits10; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::radix; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::min_exponent; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::min_exponent10; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::max_exponent; +template +EIGEN_CONSTEXPR const int numeric_limits_half_impl::max_exponent10; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::traps; +template +EIGEN_CONSTEXPR const bool numeric_limits_half_impl::tinyness_before; +} // end namespace half_impl +} // end namespace Eigen + +namespace std { // If std::numeric_limits is specialized, should also specialize // std::numeric_limits, std::numeric_limits, and // std::numeric_limits // https://stackoverflow.com/a/16519653/ template<> -struct numeric_limits : numeric_limits {}; +class numeric_limits : public Eigen::half_impl::numeric_limits_half_impl<> {}; template<> -struct numeric_limits : numeric_limits {}; +class numeric_limits : public numeric_limits {}; template<> -struct numeric_limits : numeric_limits {}; -} // end namespace std +class numeric_limits : public numeric_limits {}; +template<> +class numeric_limits : public numeric_limits {}; +} // end namespace std namespace Eigen { @@ -261,7 +317,7 @@ namespace half_impl { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ EIGEN_CUDA_ARCH >= 530) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) -// Note: We deliberatly do *not* define this to 1 even if we have Arm's native +// Note: We deliberately do *not* define this to 1 even if we have Arm's native // fp16 type since GPU halfs are rather different from native CPU halfs. 
// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16 #define EIGEN_HAS_NATIVE_FP16 @@ -334,7 +390,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { } #endif -#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } @@ -534,7 +590,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #elif defined(EIGEN_HAS_FP16_C) __half_raw h; - h.x = _cvtss_sh(ff, 0); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0); + #else + h.x = _cvtss_sh(ff, 0); + #endif return h; #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) @@ -595,7 +656,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x))); + #else + return _cvtsh_ss(h.x); + #endif #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return static_cast(h.x); #else @@ -692,6 +758,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { return half(::powf(float(a), float(b))); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan2(const half& a, const half& b) { + return half(::atan2f(float(a), float(b))); +} EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { return half(::sinf(float(a))); } diff --git a/libs/eigen/Eigen/src/Core/arch/Default/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/Default/TypeCasting.h index fb8183b..dc779a7 100644 --- a/libs/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/Default/TypeCasting.h @@ -11,13 +11,14 @@ #ifndef EIGEN_GENERIC_TYPE_CASTING_H #define EIGEN_GENERIC_TYPE_CASTING_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { template<> struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ @@ -36,7 +37,6 @@ struct functor_traits > template<> struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ @@ -55,7 +55,6 @@ struct functor_traits > template<> struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ @@ -74,7 +73,6 @@ struct functor_traits > template<> struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::bfloat16 result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const { return Eigen::bfloat16(a); @@ -88,7 +86,6 @@ struct functor_traits > template<> struct scalar_cast_op { - 
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::bfloat16 result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const { return Eigen::bfloat16(static_cast(a)); @@ -102,7 +99,6 @@ struct functor_traits > template<> struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const { return static_cast(a); diff --git a/libs/eigen/Eigen/src/Core/arch/CUDA/Complex.h b/libs/eigen/Eigen/src/Core/arch/GPU/Complex.h similarity index 95% rename from libs/eigen/Eigen/src/Core/arch/CUDA/Complex.h rename to libs/eigen/Eigen/src/Core/arch/GPU/Complex.h index deb4c86..c2b4c38 100644 --- a/libs/eigen/Eigen/src/Core/arch/CUDA/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/GPU/Complex.h @@ -8,18 +8,29 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_COMPLEX_CUDA_H -#define EIGEN_COMPLEX_CUDA_H +#ifndef EIGEN_COMPLEX_GPU_H +#define EIGEN_COMPLEX_GPU_H -// clang-format off // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, GCC and older versions of clang do // not treat them as device functions and thus Eigen functors making use of // these operators fail to compile. Here, we manually specialize these // operators and functors for complex types when building for CUDA to enable // their use on-device. +// +// NOTES: +// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device, +// since they are already specialized in the standard. Using them will result +// in silent kernel failures. +// - Compiling with MSVC and using +=,-=,*=,/=(std::complex) will lead +// to duplicate definition errors, since these are already specialized in +// Visual Studio's header (contrary to the standard). This is +// preferable to removing such definitions, which will lead to silent kernel +// failures. +// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior +// to the first inclusion of . -#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(EIGEN_GPUCC) && defined(EIGEN_GPU_COMPILE_PHASE) // ICC already specializes std::complex and std::complex // operators, preventing us from making them device functions here. @@ -43,6 +54,8 @@ using Eigen::complex_operator_detail::operator==; \ using Eigen::complex_operator_detail::operator!=; +#include "../../InternalHeaderCheck.h" + namespace Eigen { // Specialized std::complex overloads. 
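The specialization pattern described in the notes above boils down to re-expressing complex arithmetic through the real/imag accessors, wrapped in functions explicitly marked for host and device. A minimal sketch of that idea follows; it is not the actual Eigen overload set, `EX_HOST_DEVICE` is an illustrative macro, and device-side use of the C++14 `constexpr` accessors assumes nvcc's `--expt-relaxed-constexpr` or clang:

```cpp
#include <complex>

#if defined(__CUDACC__) || defined(__HIPCC__)
#define EX_HOST_DEVICE __host__ __device__
#else
#define EX_HOST_DEVICE
#endif

// Device-safe complex multiply: avoids std::complex::operator*, which is not
// constexpr and therefore not treated as a device function by GCC.
template <typename T>
EX_HOST_DEVICE std::complex<T> complex_multiply(const std::complex<T>& a,
                                                const std::complex<T>& b) {
  const T a_re = a.real(), a_im = a.imag();  // constexpr accessors (C++14)
  const T b_re = b.real(), b_im = b.imag();
  return std::complex<T>(a_re * b_re - a_im * b_im,
                         a_re * b_im + a_im * b_re);
}
```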
@@ -253,6 +266,6 @@ EIGEN_USING_STD_COMPLEX_OPERATORS #endif // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_) -#endif // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE +#endif // EIGEN_GPUCC && EIGEN_GPU_COMPILE_PHASE -#endif // EIGEN_COMPLEX_CUDA_H +#endif // EIGEN_COMPLEX_GPU_H diff --git a/libs/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h index d2b3a25..ad61e95 100644 --- a/libs/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h @@ -10,6 +10,8 @@ #ifndef EIGEN_MATH_FUNCTIONS_GPU_H #define EIGEN_MATH_FUNCTIONS_GPU_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/GPU/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/GPU/PacketMath.h index 689110d..e2bcf48 100644 --- a/libs/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/GPU/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_GPU_H #define EIGEN_PACKET_MATH_GPU_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -121,7 +123,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) { @@ -175,12 +176,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) { return __int_as_float(a < b ? 0xffffffffu : 0u); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) { return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); } -} // namespace +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, + const float& b) { + return __int_as_float(a <= b ? 0xffffffffu : 0u); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, + const double& b) { + return __longlong_as_double(a <= b ? 
0xffffffffffffffffull : 0ull); +} template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, @@ -243,6 +253,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt(const float4& a, lt_mask(a.w, b.w)); } template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le(const float4& a, + const float4& b) { + return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), + le_mask(a.w, b.w)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq(const double2& a, const double2& b) { return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); @@ -252,6 +268,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt(const double2& a, const double2& b) { return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_le(const double2& a, const double2& b) { + return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y)); +} #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { @@ -493,9 +514,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated. +#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +548,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +565,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. 
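The `le_mask`/`pcmp_le` additions above follow the standard SIMD comparison convention: a true lane is an all-ones bit pattern reinterpreted as a float, a false lane is all zeros, so the result can drive bitwise selection (`pselect`, `pand`, and friends). A scalar stand-in in plain C++, where `bits_to_float` plays the role of the CUDA `__int_as_float` intrinsic:

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

static float bits_to_float(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // well-defined bit cast
  return f;
}

static float le_mask(float a, float b) {
  return bits_to_float(a <= b ? 0xffffffffu : 0u);
}

// Mask-driven select: all-ones lanes pick x, all-zero lanes pick y.
static float select_by_mask(float mask, float x, float y) {
  uint32_t m, xb, yb;
  std::memcpy(&m,  &mask, sizeof(m));
  std::memcpy(&xb, &x,    sizeof(xb));
  std::memcpy(&yb, &y,    sizeof(yb));
  return bits_to_float((m & xb) | (~m & yb));
}

int main() {
  const float m = le_mask(1.0f, 2.0f);            // all-ones (a NaN pattern)
  printf("%g\n", select_by_mask(m, 3.0f, 4.0f));  // prints 3
}
```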
-// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +572,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +586,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +597,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +606,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +645,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +658,101 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, 
__float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) <= __half2float(b2) ? 
true_half : false_half; + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +851,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +862,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +885,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? 
__low2half(a) : __high2half(a); #endif } @@ -897,7 +897,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1068,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1152,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1254,10 +1254,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1292,6 +1292,34 @@ pcmp_eq(const Packet4h2& a, const Packet4h2& b) { return r; } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pcmp_lt(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]); + r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]); + r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]); + r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pcmp_le(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pcmp_le(a_alias[0], b_alias[0]); + r_alias[1] = pcmp_le(a_alias[1], b_alias[1]); + r_alias[2] = pcmp_le(a_alias[2], b_alias[2]); + r_alias[3] = pcmp_le(a_alias[3], b_alias[3]); + return r; +} + template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand( const Packet4h2& a, const Packet4h2& b) { @@ -1477,9 +1505,9 @@ template <> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1524,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1680,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1692,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/libs/eigen/Eigen/src/Core/arch/GPU/Tuple.h b/libs/eigen/Eigen/src/Core/arch/GPU/Tuple.h new file mode 100644 index 0000000..e223ca1 --- /dev/null +++ b/libs/eigen/Eigen/src/Core/arch/GPU/Tuple.h @@ -0,0 +1,302 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 The Eigen Team +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TUPLE_GPU +#define EIGEN_TUPLE_GPU + +#include +#include + +// This is a replacement of std::tuple that can be used in device code. + +namespace Eigen { +namespace internal { +namespace tuple_impl { + +// Internal tuple implementation. +template +class TupleImpl; + +// Generic recursive tuple. +template +class TupleImpl { + public: + // Tuple may contain Eigen types. + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + // Default constructor, enable if all types are default-constructible. + template::value + && reduce_all::value...>::value + >> + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC + TupleImpl() : head_{}, tail_{} {} + + // Element constructor. 
+ template 1 || std::is_convertible::value) + >> + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC + TupleImpl(U1&& arg1, Us&&... args) + : head_(std::forward(arg1)), tail_(std::forward(args)...) {} + + // The first stored value. + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T1& head() { + return head_; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const T1& head() const { + return head_; + } + + // The tail values. + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TupleImpl& tail() { + return tail_; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const TupleImpl& tail() const { + return tail_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(TupleImpl& other) { + using numext::swap; + swap(head_, other.head_); + swap(tail_, other.tail_); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TupleImpl& operator=(const TupleImpl& other) { + head_ = other.head_; + tail_ = other.tail_; + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TupleImpl& operator=(TupleImpl&& other) { + head_ = std::move(other.head_); + tail_ = std::move(other.tail_); + return *this; + } + + private: + // Allow related tuples to reference head_/tail_. + template + friend class TupleImpl; + + T1 head_; + TupleImpl tail_; +}; + +// Empty tuple specialization. +template<> +class TupleImpl {}; + +template +struct is_tuple : std::false_type {}; + +template +struct is_tuple< TupleImpl > : std::true_type {}; + +// Gets an element from a tuple. +template +struct tuple_get_impl { + using TupleType = TupleImpl; + using ReturnType = typename tuple_get_impl::ReturnType; + + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + ReturnType& run(TupleType& tuple) { + return tuple_get_impl::run(tuple.tail()); + } + + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const ReturnType& run(const TupleType& tuple) { + return tuple_get_impl::run(tuple.tail()); + } +}; + +// Base case, getting the head element. +template +struct tuple_get_impl<0, T1, Ts...> { + using TupleType = TupleImpl; + using ReturnType = T1; + + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T1& run(TupleType& tuple) { + return tuple.head(); + } + + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const T1& run(const TupleType& tuple) { + return tuple.head(); + } +}; + +// Concatenates N Tuples. +template +struct tuple_cat_impl; + +template +struct tuple_cat_impl, TupleImpl, Tuples...> { + using TupleType1 = TupleImpl; + using TupleType2 = TupleImpl; + using MergedTupleType = TupleImpl; + + using ReturnType = typename tuple_cat_impl::ReturnType; + + // Uses the index sequences to extract and merge elements from tuple1 and tuple2, + // then recursively calls again. + template + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ReturnType run(Tuple1&& tuple1, std::index_sequence, + Tuple2&& tuple2, std::index_sequence, + MoreTuples&&... tuples) { + return tuple_cat_impl::run( + MergedTupleType(tuple_get_impl::run(std::forward(tuple1))..., + tuple_get_impl::run(std::forward(tuple2))...), + std::forward(tuples)...); + } + + // Concatenates the first two tuples. + template + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2, MoreTuples&&... tuples) { + return run(std::forward(tuple1), std::make_index_sequence{}, + std::forward(tuple2), std::make_index_sequence{}, + std::forward(tuples)...); + } +}; + +// Base case with a single tuple. 
+template +struct tuple_cat_impl<1, TupleImpl > { + using ReturnType = TupleImpl; + + template + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ReturnType run(Tuple1&& tuple1) { + return tuple1; + } +}; + +// Special case of no tuples. +template<> +struct tuple_cat_impl<0> { + using ReturnType = TupleImpl<0>; + static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ReturnType run() {return ReturnType{}; } +}; + +// For use in make_tuple, unwraps a reference_wrapper. +template +struct unwrap_reference_wrapper { using type = T; }; + +template +struct unwrap_reference_wrapper > { using type = T&; }; + +// For use in make_tuple, decays a type and unwraps a reference_wrapper. +template +struct unwrap_decay { + using type = typename unwrap_reference_wrapper::type>::type; +}; + +/** + * Utility for determining a tuple's size. + */ +template +struct tuple_size; + +template +struct tuple_size< TupleImpl > : std::integral_constant {}; + +/** + * Gets an element of a tuple. + * \tparam Idx index of the element. + * \tparam Types ... tuple element types. + * \param tuple the tuple. + * \return a reference to the desired element. + */ +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename tuple_get_impl::ReturnType& +get(const TupleImpl& tuple) { + return tuple_get_impl::run(tuple); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename tuple_get_impl::ReturnType& +get(TupleImpl& tuple) { + return tuple_get_impl::run(tuple); +} + +/** + * Concatenate multiple tuples. + * \param tuples ... list of tuples. + * \return concatenated tuple. + */ +template::type>::value...>::value>> +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename tuple_cat_impl::type...>::ReturnType +tuple_cat(Tuples&&... tuples) { + return tuple_cat_impl::type...>::run(std::forward(tuples)...); +} + +/** + * Tie arguments together into a tuple. + */ +template > +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ReturnType tie(Args&... args) EIGEN_NOEXCEPT { + return ReturnType{args...}; +} + +/** + * Create a tuple of l-values with the supplied arguments. + */ +template ::type...> > +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ReturnType make_tuple(Args&&... args) { + return ReturnType{std::forward(args)...}; +} + +/** + * Forward a set of arguments as a tuple. + */ +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TupleImpl forward_as_tuple(Args&&... args) { + return TupleImpl(std::forward(args)...); +} + +/** + * Alternative to std::tuple that can be used on device. 
+ */ +template +using tuple = TupleImpl; + +} // namespace tuple_impl +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TUPLE_GPU diff --git a/libs/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h index 7545462..6e8ba27 100644 --- a/libs/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -10,13 +10,14 @@ #ifndef EIGEN_TYPE_CASTING_GPU_H #define EIGEN_TYPE_CASTING_GPU_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/libs/eigen/Eigen/src/Core/arch/MSA/Complex.h b/libs/eigen/Eigen/src/Core/arch/MSA/Complex.h index 53dacfa..b11a9b4 100644 --- a/libs/eigen/Eigen/src/Core/arch/MSA/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/MSA/Complex.h @@ -15,6 +15,8 @@ #include +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -75,15 +77,12 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; } - EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { - *this *= b.conjugate(); - Packet4f s = pmul(b.v, b.v); - s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - v = pdiv(v, s); - return *this; - } EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { - return Packet2cf(*this) /= b; + return pdiv_complex(Packet2cf(*this), b); + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this = Packet2cf(*this) / b; + return *this; } EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(pnegate(v)); diff --git a/libs/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h index f5181b9..5932041 100644 --- a/libs/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h @@ -26,27 +26,29 @@ #ifndef EIGEN_MATH_FUNCTIONS_MSA_H #define EIGEN_MATH_FUNCTIONS_MSA_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + static 
EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); // Convert negative argument into NAN (quiet negative, to be specific). Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0); @@ -119,23 +121,23 @@ plog(const Packet4f& _x) { } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp(const Packet4f& _x) { // Limiting single-precision pexp's argument to [-128, +128] lets pexp // reach 0 and INFINITY naturally. - static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); - static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); + static EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); + static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); Packet4f x = _x; @@ -172,23 +174,23 @@ pexp(const Packet4f& _x) { } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); + static EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 
1e-4f); + static EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); // The monomial coefficients of the numerator polynomial (odd). - static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + static EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); // The monomial coefficients of the denominator polynomial (even). - static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); + static EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); + static EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); + static EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); + static EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); Packet4f x = pabs(_x); Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); @@ -229,19 +231,19 @@ ptanh(const Packet4f& _x) { template Packet4f psincos_inner_msa_float(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + static EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). 
+ static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); + static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + static EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); + static EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); + static EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); + static EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); + static EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); + static EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); + static EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. + static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); Packet4f x = pabs(_x); @@ -310,37 +312,37 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) { } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin(const Packet4f& x) { return psincos_inner_msa_float(x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos(const Packet4f& x) { return psincos_inner_msa_float(x); } template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& _x) { // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp // reach 0 and INFINITY naturally. - static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); - static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); - static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); + static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); + static EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); + static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 
2.27265548208155028766e-1); + static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + static EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + static EIGEN_DECLARE_CONST_Packet2d(1, 1.0); + static EIGEN_DECLARE_CONST_Packet2d(2, 2.0); Packet2d x = _x; diff --git a/libs/eigen/Eigen/src/Core/arch/MSA/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/MSA/PacketMath.h index afe8f33..f03dbed 100644 --- a/libs/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/MSA/PacketMath.h @@ -16,6 +16,8 @@ #include #include +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -51,9 +53,9 @@ typedef v4f32 Packet4f; typedef v4i32 Packet4i; typedef v4u32 Packet4ui; -#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } -#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } -#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } +#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } +#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } +#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; @@ -821,9 +823,9 @@ typedef v2f64 Packet2d; typedef v2i64 Packet2l; typedef v2u64 Packet2ul; -#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } -#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } -#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } +#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } +#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } +#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { os << "[ " << value[0] << ", " << value[1] << " ]"; diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/Complex.h b/libs/eigen/Eigen/src/Core/arch/NEON/Complex.h index f40af7f..008dd7a 100644 --- a/libs/eigen/Eigen/src/Core/arch/NEON/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/NEON/Complex.h @@ -11,6 +11,8 @@ #ifndef EIGEN_COMPLEX_NEON_H #define EIGEN_COMPLEX_NEON_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -347,27 +349,11 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { - // TODO optimize it for NEON - Packet1cf res = pmul(a, pconj(b)); - Packet2f s, rev_s; - - // this computes the norm - s = vmul_f32(b.v, b.v); - rev_s = vrev64_f32(s); - - return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for NEON - Packet2cf res = pmul(a,pconj(b)); - Packet4f s, rev_s; - - // this computes the norm - s = vmulq_f32(b.v, b.v); - rev_s = vrev64q_f32(s); - - return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); + return pdiv_complex(a, b); } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} @@ -390,7 +376,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // 
See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CPE static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; @@ -553,12 +539,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for NEON - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = pmul(b.v, b.v); - Packet2d rev_s = preverse(s); - - return Packet1cd(pdiv(res.v, padd(s,rev_s))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/libs/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 3481f33..b97a090 100644 --- a/libs/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/libs/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -1,6 +1,8 @@ +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { - + #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. @@ -41,15 +43,18 @@ struct gebp_traits #if EIGEN_ARCH_ARM64 +#ifndef EIGEN_NEON_GEBP_NR +#define EIGEN_NEON_GEBP_NR 8 +#endif + template<> struct gebp_traits : gebp_traits { typedef float RhsPacket; typedef float32x4_t RhsPacketx4; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const - { + enum { nr = EIGEN_NEON_GEBP_NR }; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } @@ -75,7 +80,6 @@ struct gebp_traits { c = vfmaq_n_f32(c, a, b); } - // NOTE: Template parameter inference failed when compiled with Android NDK: // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". @@ -92,9 +96,10 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f32 is implemented through a costly dup + #if EIGEN_COMP_GNUC_STRICT + // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9 + // 2. workaround the gcc register split problem on arm64-neon if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); @@ -111,7 +116,7 @@ struct gebp_traits : gebp_traits { typedef double RhsPacket; - + enum { nr = EIGEN_NEON_GEBP_NR }; struct RhsPacketx4 { float64x2_t B_0, B_1; }; @@ -161,9 +166,10 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f64 is implemented through a costly dup + #if EIGEN_COMP_GNUC_STRICT + // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9 + // 2. 
workaround the gcc register split problem on arm64-neon if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); @@ -177,6 +183,73 @@ struct gebp_traits } }; +// The register at operand 3 of fmla for data type half must be v0~v15, but the compiler may not +// allocate a required register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]', +// so inline assembly can't be used here to avoid the bug where vfmaq_lane_f16 is implemented +// through a costly dup in the gcc compiler. +#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG + +template<> +struct gebp_traits + : gebp_traits +{ + typedef half RhsPacket; + typedef float16x4_t RhsPacketx4; + typedef float16x4_t PacketHalf; + enum { nr = EIGEN_NEON_GEBP_NR }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1_f16((const __fp16 *)b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS + // using a single scalar value. + eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS."); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f16(c, a, b); + } + EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfma_n_f16(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>".
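// For reference, each madd overload below performs one lane-indexed fused
// multiply-accumulate, c += a * broadcast(b[Lane]). With intrinsics the same
// step reads (illustrative sketch by the editor, not part of the patch):
//
//   template <int Lane>
//   float16x8_t madd_lane(float16x8_t acc, float16x8_t a, float16x4_t b) {
//     return vfmaq_lane_f16(acc, a, b, Lane);
//   }
//
// The fp16 fmla-by-element form needs its lane operand in v0-v15, so the
// intrinsic is used directly here instead of the inline asm seen above.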
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + c = vfmaq_lane_f16(c, a, b, LaneID); + } +}; +#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG #endif // EIGEN_ARCH_ARM64 } // namespace internal diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h index fa6615a..aea5149 100644 --- a/libs/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -8,36 +8,72 @@ #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp(const Packet2f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pexp(const Packet2f& x) { return pexp_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp(const Packet4f& x) { return pexp_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog(const Packet2f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f plog(const Packet2f& x) { return plog_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog(const Packet4f& x) { return plog_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin(const Packet2f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f psin(const Packet2f& x) { return psin_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin(const Packet4f& x) { return psin_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos(const Packet2f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pcos(const Packet2f& x) { return pcos_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos(const Packet4f& x) { return pcos_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pacos(const Packet2f& x) +{ return pacos_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pacos(const Packet4f& x) +{ return pacos_float(x); } + +template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pasin(const Packet2f& x)
+{ return pasin_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pasin(const Packet4f& x)
+{ return pasin_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patan(const Packet2f& x)
+{ return patan_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patan(const Packet4f& x)
+{ return patan_float(x); }
+
 // Hyperbolic Tangent function.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh(const Packet2f& x)
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f ptanh(const Packet2f& x)
{ return internal::generic_fast_tanh_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x)
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& x)
{ return internal::generic_fast_tanh_float(x); }
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet4hf ptanh(const Packet4hf& x) {
+  // Convert to float, call the float ptanh, and then convert back.
+  return vcvt_f16_f32(ptanh(vcvt_f32_f16(x)));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet8hf ptanh(const Packet8hf& x) {
+  // Convert each group of 4 halves to float, call the float ptanh, and then convert back.
+  return vcombine_f16(
+    vcvt_f16_f32(ptanh(vcvt_f32_f16(vget_low_f16(x)))),
+    vcvt_f16_f32(ptanh(vcvt_high_f32_f16(x))));
+}
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
@@ -60,12 +96,15 @@ EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& expone
//---------- double ----------
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& x)
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& x)
{ return pexp_double(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog(const Packet2d& x)
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d plog(const Packet2d& x)
{ return plog_double(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d patan(const Packet2d& x)
+{ return patan_double(x); }
+
#endif
} // end namespace internal
diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
index d2aeef4..8dd288b 100644
--- a/libs/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/libs/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -12,6 +12,8 @@
#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H
+#include "../../InternalHeaderCheck.h"
+
namespace Eigen {
namespace internal {
@@ -137,13 +139,13 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
#define vec4f_duplane(a, p) \
 vdupq_lane_f32(vget_low_f32(a), p)
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
 const Packet4f p4f_##NAME = pset1(X)
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
 const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X))
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X)
\ +#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) #if EIGEN_ARCH_ARM64 @@ -155,7 +157,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif EIGEN_ARCH_ARM32 +#elif EIGEN_ARCH_ARM #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching @@ -196,6 +198,9 @@ struct packet_traits : default_packet_traits HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, + HasACos = 1, + HasASin = 1, + HasATan = 1, HasLog = 1, HasExp = 1, HasSqrt = 1, @@ -219,6 +224,7 @@ struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, + HasCmp = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -248,6 +254,7 @@ struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, + HasCmp = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -313,7 +320,7 @@ struct packet_traits : default_packet_traits HasShift = 1, HasMul = 1, HasNegate = 0, - HasAbs = 0, + HasAbs = 1, HasAbsDiff = 1, HasArg = 0, HasAbs2 = 1, @@ -372,7 +379,7 @@ struct packet_traits : default_packet_traits HasShift = 1, HasMul = 1, HasNegate = 0, - HasAbs = 0, + HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasAbsDiff = 1, @@ -434,7 +441,7 @@ struct packet_traits : default_packet_traits HasShift = 1, HasMul = 1, HasNegate = 0, - HasAbs = 0, + HasAbs = 1, HasArg = 0, HasAbs2 = 1, HasAbsDiff = 1, @@ -446,15 +453,6 @@ struct packet_traits : default_packet_traits }; }; -#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM -// workaround gcc 4.2, 4.3 and 4.4 compilation issue -EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } -#endif - template<> struct unpacket_traits { typedef float type; @@ -712,9 +710,9 @@ template<> EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) template<> EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { return vdupq_n_s64(from); } template<> EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { return vdupq_n_u64(from); } -template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(unsigned int from) +template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(uint32_t from) { return vreinterpret_f32_u32(vdup_n_u32(from)); } -template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(uint32_t from) { return vreinterpretq_f32_u32(vdupq_n_u32(from)); } template<> EIGEN_STRONG_INLINE Packet2f plset(const float& a) @@ -2374,6 +2372,15 @@ template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { } template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; } +template <> +EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) { + return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31)); +} +template <> +EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { + return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31)); +} + template<> EIGEN_STRONG_INLINE 
Packet2f pfrexp(const Packet2f& a, Packet2f& exponent) { return pfrexp_generic(a,exponent); } template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) @@ -2384,12 +2391,17 @@ template<> EIGEN_STRONG_INLINE Packet2f pldexp(const Packet2f& a, cons template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_generic(a,exponent); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vaddv_f32(a); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { return vaddvq_f32(a); } +#else template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); return vget_lane_f32(vpadd_f32(sum, sum), 0); } +#endif template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) { const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2397,6 +2409,10 @@ template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) sum = vpadd_s8(sum, sum); return vget_lane_s8(sum, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) { return vaddv_s8(a); } +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) { return vaddvq_s8(a); } +#else template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) { int8x8_t sum = vpadd_s8(a,a); @@ -2412,6 +2428,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) sum = vpadd_s8(sum, sum); return vget_lane_s8(sum, 0); } +#endif template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) { const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2419,6 +2436,20 @@ template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) sum = vpadd_u8(sum, sum); return vget_lane_u8(sum, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) { return vaddv_u8(a); } +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) { return vaddvq_u8(a); } +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet4s& a) { return vaddv_s16(a); } +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) { return vaddvq_s16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet4us& a) { return vaddv_u16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) { return vaddvq_u16(a); } +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet2i& a) { return vaddv_s32(a); } +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { return vaddvq_s32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet2ui& a) { return vaddv_u32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { return vaddvq_u32(a); } +template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { return vaddvq_s64(a); } +template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) { return vaddvq_u64(a); } +#else template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) { uint8x8_t sum = vpadd_u8(a,a); @@ -2474,6 +2505,7 @@ template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) { return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); } +#endif template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) { @@ -2574,6 +2606,10 @@ 
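// Annotation (ours, not patch content) on the predux hunks above: on AArch64
// the across-lane reduction instructions collapse the old pairwise ladder into
// a single op. Both of these compute the same horizontal sum (names are ours):
//
//   float predux_pairwise(float32x4_t a) {   // generic ARM path
//     const float32x2_t s = vadd_f32(vget_low_f32(a), vget_high_f32(a));
//     return vget_lane_f32(vpadd_f32(s, s), 0);
//   }
//   float predux_across(float32x4_t a) {     // A64-only path
//     return vaddvq_f32(a);
//   }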
template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a { return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); } // min +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) { return vminv_f32(a); } +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { return vminvq_f32(a); } +#else template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) { return vget_lane_f32(vpmin_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) @@ -2581,6 +2617,7 @@ template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); return vget_lane_f32(vpmin_f32(min, min), 0); } +#endif template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) { const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2588,6 +2625,10 @@ template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) min = vpmin_s8(min, min); return vget_lane_s8(min, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) { return vminv_s8(a); } +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) { return vminvq_s8(a); } +#else template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) { int8x8_t min = vpmin_s8(a,a); @@ -2603,6 +2644,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) min = vpmin_s8(min, min); return vget_lane_s8(min, 0); } +#endif template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) { const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2610,6 +2652,18 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) min = vpmin_u8(min, min); return vget_lane_u8(min, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) { return vminv_u8(a); } +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) { return vminvq_u8(a); } +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet4s& a) { return vminv_s16(a); } +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) { return vminvq_s16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet4us& a) { return vminv_u16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) { return vminvq_u16(a); } +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet2i& a) { return vminv_s32(a); } +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { return vminvq_s32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet2ui& a) { return vminv_u32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) { return vminvq_u32(a); } +#else template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) { uint8x8_t min = vpmin_u8(a,a); @@ -2663,12 +2717,17 @@ template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a)); return vget_lane_u32(vpmin_u32(min, min), 0); } +#endif template<> EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) { return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } template<> EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) { return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } // max +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) { return vmaxv_f32(a); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { 
return vmaxvq_f32(a); } +#else template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) { return vget_lane_f32(vpmax_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) @@ -2676,6 +2735,7 @@ template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); return vget_lane_f32(vpmax_f32(max, max), 0); } +#endif template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) { const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2683,6 +2743,10 @@ template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) max = vpmax_s8(max, max); return vget_lane_s8(max, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) { return vmaxv_s8(a); } +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) { return vmaxvq_s8(a); } +#else template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) { int8x8_t max = vpmax_s8(a,a); @@ -2698,6 +2762,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) max = vpmax_s8(max, max); return vget_lane_s8(max, 0); } +#endif template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) { const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2705,6 +2770,18 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) max = vpmax_u8(max, max); return vget_lane_u8(max, 0); } +#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) { return vmaxv_u8(a); } +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) { return vmaxvq_u8(a); } +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet4s& a) { return vmaxv_s16(a); } +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) { return vmaxvq_s16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet4us& a) { return vmaxv_u16(a); } +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) { return vmaxvq_u16(a); } +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet2i& a) { return vmaxv_s32(a); } +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { return vmaxvq_s32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet2ui& a) { return vmaxv_u32(a); } +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) { return vmaxvq_u32(a); } +#else template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) { uint8x8_t max = vpmax_u8(a,a); @@ -2758,6 +2835,7 @@ template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a)); return vget_lane_u32(vpmax_u32(max, max), 0); } +#endif template<> EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) { return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } template<> EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) @@ -3274,23 +3352,13 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { } template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { - // Compute approximate reciprocal sqrt. - Packet4f x = vrsqrteq_f32(a); // Do Newton iterations for 1/sqrt(x). 
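// Annotation (ours): the deleted iterations below, like the
// generic_rsqrt_newton_step helper replacing them, refine y ~ 1/sqrt(a) via
// y' = y * (1.5 - 0.5 * a * y * y). vrsqrtsq_f32(x, y) computes (3 - x*y) / 2,
// so feeding it (a*y, y) yields exactly that correction factor:
//
//   float32x4_t rsqrt_one_step(float32x4_t a, float32x4_t y) {  // name is ours
//     return vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, y), y), y);
//   }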
- x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); - x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); - const Packet4f infinity = pset1(NumTraits::infinity()); - return pselect(pcmp_eq(a, pzero(a)), infinity, x); + return generic_rsqrt_newton_step::run(a, vrsqrteq_f32(a)); } template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) { // Compute approximate reciprocal sqrt. - Packet2f x = vrsqrte_f32(a); - // Do Newton iterations for 1/sqrt(x). - x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); - x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); - const Packet2f infinity = pset1(NumTraits::infinity()); - return pselect(pcmp_eq(a, pzero(a)), infinity, x); + return generic_rsqrt_newton_step::run(a, vrsqrte_f32(a)); } // Unfortunately vsqrt_f32 is only available for A64. @@ -3299,14 +3367,10 @@ template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_ template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); } #else template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { - const Packet4f infinity = pset1(NumTraits::infinity()); - const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); - return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); + return generic_sqrt_newton_step::run(a, prsqrt(a)); } template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { - const Packet2f infinity = pset1(NumTraits::infinity()); - const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); - return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); + return generic_sqrt_newton_step::run(a, prsqrt(a)); } #endif @@ -3386,7 +3450,7 @@ EIGEN_ALWAYS_INLINE void zip_in_place(Packet4bf& p1, Packet4bf& p2) { EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) { - // See the scalar implemention in BFloat16.h for a comprehensible explanation + // See the scalar implementation in BFloat16.h for a comprehensible explanation // of this fast rounding algorithm Packet4ui input = reinterpret_cast(p); @@ -3707,10 +3771,13 @@ template<> struct packet_traits : default_packet_traits HasCeil = 1, HasRint = 1, +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + HasExp = 1, + HasLog = 1, + HasATan = 1, +#endif HasSin = 0, HasCos = 0, - HasLog = 1, - HasExp = 1, HasSqrt = 1, HasRsqrt = 1, HasTanh = 0, @@ -3846,14 +3913,13 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -// workaround ICE, see bug 907 +template <> +EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { + return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63)); +} + template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } -#endif +{ return vaddvq_f64(a); } // Other reduction functions: // mul @@ -3867,11 +3933,11 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) -{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } +{ return vminvq_f64(a); } // max template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) -{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } +{ return vmaxvq_f64(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -3906,20 +3972,12 @@ template<> 
EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { return vreinterpretq_f64_u64(vdupq_n_u64(from)); } template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { - // Compute approximate reciprocal sqrt. - Packet2d x = vrsqrteq_f64(a); // Do Newton iterations for 1/sqrt(x). - x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); - x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); - x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); - const Packet2d infinity = pset1(NumTraits::infinity()); - return pselect(pcmp_eq(a, pzero(a)), infinity, x); + return generic_rsqrt_newton_step::run(a, vrsqrteq_f64(a)); } template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } -#endif // EIGEN_ARCH_ARM64 - // Do we have an fp16 types and supporting Neon intrinsics? #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC typedef float16x4_t Packet4hf; @@ -3961,6 +4019,7 @@ struct packet_traits : default_packet_traits { HasCos = 0, HasLog = 0, HasExp = 0, + HasTanh = packet_traits::HasTanh, // tanh calls tanh HasSqrt = 1, HasRsqrt = 1, HasErf = EIGEN_FAST_MATH, @@ -4420,11 +4479,21 @@ EIGEN_STRONG_INLINE Packet8hf pabs(const Packet8hf& a) { return vabsq_f16(a); } +template<> +EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) { + return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15)); +} + template <> EIGEN_STRONG_INLINE Packet4hf pabs(const Packet4hf& a) { return vabs_f16(a); } +template <> +EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) { + return vreinterpret_f16_s16( vshr_n_s16( vreinterpret_s16_f16(a), 15)); +} + template <> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8hf& a) { float16x4_t a_lo, a_hi, sum; @@ -4476,51 +4545,29 @@ EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet4hf& a) { template <> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8hf& a) { - float16x4_t a_lo, a_hi, min; - - a_lo = vget_low_f16(a); - a_hi = vget_high_f16(a); - min = vpmin_f16(a_lo, a_hi); - min = vpmin_f16(min, min); - min = vpmin_f16(min, min); - Eigen::half h; - h.x = vget_lane_f16(min, 0); + h.x = vminvq_f16(a); return h; } template <> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet4hf& a) { - Packet4hf tmp; - tmp = vpmin_f16(a, a); - tmp = vpmin_f16(tmp, tmp); Eigen::half h; - h.x = vget_lane_f16(tmp, 0); + h.x = vminv_f16(a); return h; } template <> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8hf& a) { - float16x4_t a_lo, a_hi, max; - - a_lo = vget_low_f16(a); - a_hi = vget_high_f16(a); - max = vpmax_f16(a_lo, a_hi); - max = vpmax_f16(max, max); - max = vpmax_f16(max, max); - Eigen::half h; - h.x = vget_lane_f16(max, 0); + h.x = vmaxvq_f16(a); return h; } template <> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet4hf& a) { - Packet4hf tmp; - tmp = vpmax_f16(a, a); - tmp = vpmax_f16(tmp, tmp); Eigen::half h; - h.x = vget_lane_f16(tmp, 0); + h.x = vmaxv_f16(a); return h; } @@ -4580,6 +4627,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& } #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +#endif // EIGEN_ARCH_ARM64 + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f9733..e5ddab6 100644 --- a/libs/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -11,6 +11,8 @@ #ifndef EIGEN_TYPE_CASTING_NEON_H #define EIGEN_TYPE_CASTING_NEON_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { 
namespace internal {
diff --git a/libs/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h b/libs/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h
new file mode 100644
index 0000000..67f9dcf
--- /dev/null
+++ b/libs/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h
@@ -0,0 +1,63 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NEON_UNARY_FUNCTORS_H
+#define EIGEN_NEON_UNARY_FUNCTORS_H
+
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+/** \internal
+ * \brief Template specialization of the logistic function for Eigen::half.
+ */
+template <>
+struct scalar_logistic_op<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Eigen::half operator()(const Eigen::half& x) const {
+    // Convert to float and call scalar_logistic_op<float>.
+    const scalar_logistic_op<float> float_op;
+    return Eigen::half(float_op(float(x)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Eigen::half packetOp(const Eigen::half& x) const {
+    return this->operator()(x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet4hf packetOp(const Packet4hf& x) const {
+    const scalar_logistic_op<float> float_op;
+    return vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(x)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet8hf packetOp(const Packet8hf& x) const {
+    const scalar_logistic_op<float> float_op;
+    return vcombine_f16(
+      vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
+      vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
+  }
+};
+
+template<>
+struct functor_traits<scalar_logistic_op<Eigen::half> > {
+  enum {
+    Cost = functor_traits<scalar_logistic_op<float> >::Cost,
+    PacketAccess = functor_traits<scalar_logistic_op<float> >::PacketAccess,
+  };
+};
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_NEON_UNARY_FUNCTORS_H
diff --git a/libs/eigen/Eigen/src/Core/arch/SSE/Complex.h b/libs/eigen/Eigen/src/Core/arch/SSE/Complex.h
index 8fe22da..60308ce 100644
--- a/libs/eigen/Eigen/src/Core/arch/SSE/Complex.h
+++ b/libs/eigen/Eigen/src/Core/arch/SSE/Complex.h
@@ -10,6 +10,8 @@
#ifndef EIGEN_COMPLEX_SSE_H
#define EIGEN_COMPLEX_SSE_H
+#include "../../InternalHeaderCheck.h"
+
namespace Eigen {
namespace internal {
@@ -106,14 +108,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex
template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from)
{
-  Packet2cf res;
-#ifdef EIGEN_VECTORIZE_SSE3
-  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<const double*>(&from)));
-#else
-  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(&from)));
-  res.v = _mm_movelh_ps(res.v, res.v);
-#endif
-  return res;
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet2cf(_mm_set_ps(im, re, im, re));
}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); }
@@ -140,17 +137,9 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a)
{
-  #if EIGEN_GNUC_AT_MOST(4,3)
-  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
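// Annotation (ours, not patch content) on the pset1 rewrite above: _mm_set_ps
// lists lanes from the highest to the lowest, so _mm_set_ps(im, re, im, re)
// stores [re, im, re, im] in memory order, i.e. the complex value duplicated
// across both halves of the register:
//
//   __m128 set1_complex_demo(float re, float im) {  // name is ours
//     return _mm_set_ps(im, re, im, re);  // lane0=re, lane1=im, lane2=re, lane3=im
//   }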
- // This workaround also fix invalid code generation with gcc 4.3 - EIGEN_ALIGN16 std::complex res[2]; - _mm_store_ps((float*)res, a.v); - return res[0]; - #else - std::complex res; + alignas(alignof(__m64)) std::complex res; _mm_storel_pi((__m64*)&res, a.v); return res; - #endif } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } @@ -174,14 +163,9 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for SSE3 and 4 - Packet2cf res = pmul(a, pconj(b)); - __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); + return pdiv_complex(a, b); } - - //---------- double ---------- struct Packet1cd { @@ -299,10 +283,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for SSE3 and 4 - Packet1cd res = pmul(a,pconj(b)); - __m128d s = _mm_mul_pd(b.v,b.v); - return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/* */(const Packet1cd& x) diff --git a/libs/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h index 8736d0d..f98fb7a 100644 --- a/libs/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -15,155 +15,123 @@ #ifndef EIGEN_MATH_FUNCTIONS_SSE_H #define EIGEN_MATH_FUNCTIONS_SSE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog(const Packet4f& _x) { return plog_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d plog(const Packet2d& _x) { return plog_double(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog2(const Packet4f& _x) { return plog2_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d plog2(const Packet2d& _x) { return plog2_double(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog1p(const Packet4f& _x) { return generic_plog1p(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexpm1(const Packet4f& _x) { return generic_expm1(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp(const Packet4f& _x) { return pexp_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& x) { return pexp_double(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin(const Packet4f& 
_x) { return psin_float(_x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos(const Packet4f& _x) { return pcos_float(_x); } -#if EIGEN_FAST_MATH - -// Functions for sqrt. -// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step -// of Newton's method, at a cost of 1-2 bits of precision as opposed to the -// exact solution. It does not handle +inf, or denormalized numbers correctly. -// The main advantage of this approach is not just speed, but also the fact that -// it can be inlined and pipelined with other computations, further reducing its -// effective latency. This is similar to Quake3's fast inverse square root. -// For detail see here: http://www.beyond3d.com/content/articles/8/ -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psqrt(const Packet4f& _x) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f pacos(const Packet4f& _x) { - Packet4f minus_half_x = pmul(_x, pset1(-0.5f)); - Packet4f denormal_mask = pandnot( - pcmp_lt(_x, pset1((std::numeric_limits::min)())), - pcmp_lt(_x, pzero(_x))); - - // Compute approximate reciprocal sqrt. - Packet4f x = _mm_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); - // Flush results for denormals to zero. - return pandnot(pmul(_x,x), denormal_mask); + return pacos_float(_x); } -#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d patan(const Packet2d& _x) { + return patan_double(_x); +} -template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f pasin(const Packet4f& _x) +{ + return pasin_float(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f patan(const Packet4f& _x) +{ + return patan_float(_x); +} + +// Notice that for newer processors, it is counterproductive to use Newton +// iteration for square root. In particular, Skylake and Zen2 processors +// have approximately doubled throughput of the _mm_sqrt_ps instruction +// compared to their predecessors. +template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } - -#endif - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt(const Packet16b& x) { return x; } #if EIGEN_FAST_MATH - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); - - Packet4f neg_half = pmul(_x, p4f_minus_half); - - // Identity infinite, zero, negative and denormal arguments. - Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); - Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); - Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); - - // Compute an approximate result using the rsqrt intrinsic. 
- Packet4f y_approx = _mm_rsqrt_ps(_x); - - // Do a single step of Newton-Raphson iteration to improve the approximation. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet4f y_newton = pmul( - y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); - - // Select the result of the Newton-Raphson step for positive normal arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if - // x is zero or a positive denormalized float (equivalent to flushing positive - // denormalized inputs to zero). - return pselect(not_normal_finite_mask, y_approx, y_newton); -} - -#else - +// Even on Skylake, using Newton iteration is a win for reciprocal square root. template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation. - return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); + return generic_rsqrt_newton_step::run(x, _mm_rsqrt_ps(x)); } +#ifdef EIGEN_VECTORIZE_FMA +// Trying to speed up reciprocal using Newton-Raphson is counterproductive +// unless FMA is available. Without FMA pdiv(pset1(Scalar(1),a)) is +// 30% faster. +template<> EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& x) { + return generic_reciprocal_newton_step::run(x, _mm_rcp_ps(x)); +} +#endif + #endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) { - return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); -} + // Hyperbolic Tangent function. template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& x) { return internal::generic_fast_tanh_float(x); } diff --git a/libs/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100755 new mode 100644 index db102c7..a0ff359 --- a/libs/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/SSE/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_SSE_H #define EIGEN_PACKET_MATH_SSE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -30,7 +32,7 @@ namespace internal { #endif #endif -#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX +#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // have overloads for both types without linking error. // One solution is to increase ABI version using -fabi-version=4 (or greater). 
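// Annotation (ours) on the preciprocal specialization added in MathFunctions.h
// above: one Newton step for y ~ 1/a is y' = y * (2 - a*y), which roughly
// doubles the number of accurate bits of the _mm_rcp_ps estimate per step:
//
//   __m128 rcp_one_step(__m128 a, __m128 y) {  // name is ours
//     return _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, y)));
//   }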
@@ -106,16 +108,16 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b #define vec2d_duplane(a,p) \ vec2d_swizzle2(a,a,(p<<1)|p) -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ const Packet2d p2d_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f p4f_##NAME = pset1frombits(X) -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -134,8 +136,12 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, + HasReciprocal = EIGEN_FAST_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, + HasACos = 1, + HasASin = 1, + HasATan = 1, HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -152,7 +158,8 @@ struct packet_traits : default_packet_traits { #ifdef EIGEN_VECTORIZE_SSE4_1 HasRound = 1, #endif - HasRint = 1 + HasRint = 1, + HasSign = 0 // The manually vectorized version is slightly slower for SSE. }; }; template <> @@ -171,6 +178,7 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasATan = 1, HasBlend = 1, HasFloor = 1, HasCeil = 1, @@ -180,7 +188,6 @@ struct packet_traits : default_packet_traits { HasRint = 1 }; }; -#endif template<> struct packet_traits : default_packet_traits { typedef Packet4i type; @@ -188,13 +195,15 @@ template<> struct packet_traits : default_packet_traits enum { Vectorizable = 1, AlignedOnScalar = 1, + HasCmp = 1, + HasDiv=1, size=4, HasShift = 1, HasBlend = 1 }; }; - +#endif template<> struct packet_traits : default_packet_traits { typedef Packet16b type; @@ -204,7 +213,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, HasHalfPacket = 0, size=16, - + HasAdd = 1, HasSub = 1, HasShift = 0, @@ -215,7 +224,8 @@ template<> struct packet_traits : default_packet_traits HasMin = 0, HasMax = 0, HasConj = 0, - HasSqrt = 1 + HasSqrt = 1, + HasSign = 0 // Don't try to vectorize psign = identity. }; }; @@ -233,7 +243,7 @@ template<> struct unpacket_traits { template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; template<> struct unpacket_traits { typedef bool type; @@ -246,18 +256,9 @@ template<> struct scalar_div_cost { enum { value = 7 }; }; template<> struct scalar_div_cost { enum { value = 8 }; }; #endif -#if EIGEN_COMP_MSVC==1500 -// Workaround MSVC 9 internal compiler error. -// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode -// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)). 
-template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps(from,from,from,from); } -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } -#else template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } -#endif template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { return _mm_set1_epi8(static_cast(from)); } template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } @@ -292,6 +293,10 @@ template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } +template EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b); +template<> EIGEN_STRONG_INLINE Packet4f padds(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d padds(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } @@ -366,11 +371,37 @@ template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, con template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, + const Packet4i& b) { +#ifdef EIGEN_VECTORIZE_AVX + return _mm256_cvttpd_epi32( + _mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b))); +#else + __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b))); + __m128i q_hi = + _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), + _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1)))); + return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3); +#endif +} + + // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } #ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmsub_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmsub_pd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, 
const Packet2d& c) { return _mm_fnmadd_pd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a,b,c); } + +template EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c); +template<> EIGEN_STRONG_INLINE Packet4f pmadds(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadds(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a,b,c); } #endif #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -444,7 +475,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -463,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -494,7 +525,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -513,7 +544,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -618,10 +649,21 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); } +template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) +{ + Packet4f tmp = psignbit(_mm_castpd_ps(a)); +#ifdef EIGEN_VECTORIZE_AVX + return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask))); +#else + return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask))); +#endif // EIGEN_VECTORIZE_AVX +} + #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { - // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. 
+ // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round. const Packet4f mask = pset1frombits(0x80000000u); const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); @@ -720,15 +762,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pload(const bool* from) #if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - #if (EIGEN_COMP_MSVC==1600) - // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps - // (i.e., it does not generate an unaligned load!! - __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); - res = _mm_loadh_pi(res, (const __m64*)(from+2)); - return res; - #else return _mm_loadu_ps(from); - #endif } #else // NOTE: with the code below, MSVC's compiler crashes! @@ -755,6 +789,15 @@ template<> EIGEN_STRONG_INLINE Packet16b ploadu(const bool* from) return _mm_loadu_si128(reinterpret_cast(from)); } +// Load lower part of packet zero extending. +template EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits::type* from); +template<> EIGEN_STRONG_INLINE Packet4f ploadl(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(from))); } +template<> EIGEN_STRONG_INLINE Packet2d ploadl(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); } + +// Load scalar +template EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits::type* from); +template<> EIGEN_STRONG_INLINE Packet4f ploads(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from); } +template<> EIGEN_STRONG_INLINE Packet2d ploads(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { @@ -796,6 +839,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from); +template<> EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from); } + +template EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from); +template<> EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from); } +template<> EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from); } + template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); @@ -1120,6 +1171,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return _mm_movemask_ps(x) != 0x0; } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) +{ + return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], 
kernel.packet[2], kernel.packet[3]); @@ -1278,8 +1334,126 @@ template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { return ::fma(a,b,c); } +template<> EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,-c); +} +template<> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) { + return ::fma(a,b,-c); +} +template<> EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) { + return ::fmaf(-a,b,c); +} +template<> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) { + return ::fma(-a,b,c); +} +template<> EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) { + return ::fmaf(-a,b,-c); +} +template<> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) { + return ::fma(-a,b,-c); +} #endif +#ifdef EIGEN_VECTORIZE_SSE4_1 +// Helpers for half->float and float->half conversions. +// Currently only used by the AVX code. +EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) { + __m128i input = _mm_cvtepu16_epi32(h); + + // Direct vectorization of half_to_float, C parts in the comments. + __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13); + // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13); + // exp = shifted_exp & o.u; // just the exponent + __m128i exp = _mm_and_si128(ou, shifted_exp); + // o.u += (127 - 15) << 23; + ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23)); + + // Inf/NaN? + __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp); + // Inf/NaN adjust + __m128i naninf_adj = + _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask); + // extra exp adjust for Inf/NaN + ou = _mm_add_epi32(ou, naninf_adj); + + // Zero/Denormal? + __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128()); + __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23)); + // o.u += 1 << 23; + ou = _mm_add_epi32(ou, zeroden_adj); + // magic.u = 113 << 23 + __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23)); + // o.f -= magic.f + ou = _mm_castps_si128( + _mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic))); + + __m128i sign = + _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16); + // o.u |= (h.x & 0x8000) << 16; // sign bit + ou = _mm_or_si128(ou, sign); + // return o.f; + // We are actually returning uint version, to make + // _mm256_insertf128_si256 work. 
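// Annotation (ours, not patch content): a scalar mirror of the lane-wise code
// above, handy for checking one value at a time. std::memcpy is used for the
// type pun; the magic constants are the same as in the vector code.
//
//   float half_to_float_scalar(uint16_t h) {  // name is ours
//     uint32_t o = uint32_t(h & 0x7fffu) << 13;       // exponent/mantissa bits
//     const uint32_t exp = o & (0x7c00u << 13);       // just the exponent
//     o += uint32_t(127 - 15) << 23;                  // exponent rebias
//     if (exp == (0x7c00u << 13)) {
//       o += uint32_t(128 - 16) << 23;                // Inf/NaN: extra adjust
//     } else if (exp == 0) {                          // zero/denormal
//       o += 1u << 23;
//       float f; std::memcpy(&f, &o, 4);
//       f -= 6.103515625e-05f;                        // (113 << 23) reinterpreted as float
//       std::memcpy(&o, &f, 4);
//     }
//     o |= uint32_t(h & 0x8000u) << 16;               // sign bit
//     float out; std::memcpy(&out, &o, 4);
//     return out;
//   }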
+ return ou; +} + +EIGEN_STRONG_INLINE __m128i float2half(__m128 f) { + __m128i o = _mm_setzero_si128(); + + // unsigned int sign_mask = 0x80000000u; + __m128i sign = _mm_set1_epi32(0x80000000u); + // unsigned int sign = f.u & sign_mask; + sign = _mm_and_si128(sign, _mm_castps_si128(f)); + // f.u ^= sign; + f = _mm_xor_ps(f, _mm_castsi128_ps(sign)); + + __m128i fu = _mm_castps_si128(f); + + __m128i f16max = _mm_set1_epi32((127 + 16) << 23); + __m128i f32infty = _mm_set1_epi32(255 << 23); + // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set) + // there is no _mm_cmpge_epi32, so use lt and swap operands + __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f)); + __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty); + __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask); + __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00)); + __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00)); + // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + __m128i naninf_value = _mm_or_si128(inf_value, nan_value); + + __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23); + __m128i subnorm_mask = + _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23)); + // f.f += denorm_magic.f; + f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic)); + // f.u - denorm_magic.u + o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic); + o = _mm_and_si128(o, subnorm_mask); + // Correct result for inf/nan/zero/subnormal, 0 otherwise + o = _mm_or_si128(o, naninf_value); + + __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask); + o = _mm_and_si128(o, mask); + + // mant_odd = (f.u >> 13) & 1; + __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1)); + // f.u += 0xc8000fffU; + fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU)); + // f.u += mant_odd; + fu = _mm_add_epi32(fu, mand_odd); + fu = _mm_andnot_si128(mask, fu); + // f.u >> 13 + fu = _mm_srli_epi32(fu, 13); + o = _mm_or_si128(fu, o); + + // o.x |= static_cast(sign >> 16); + o = _mm_or_si128(o, _mm_srli_epi32(sign, 16)); + + // 16 bit values + return _mm_and_si128(o, _mm_set1_epi32(0xffff)); +} +#endif // Packet math for Eigen::half // Disable the following code since it's broken on too many platforms / compilers. 
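// Annotation (ours) on float2half above: the 0xc8000fff increment is the
// exponent rebias (15 - 127) << 23 taken mod 2^32, plus 0xfff of rounding
// bias; adding the mantissa-odd bit on top makes the final >>13 truncation
// round to nearest-even. Normal-range sketch (name is ours):
//
//   uint16_t round_and_rebias(uint32_t fu) {      // fu: float bits, normal range
//     const uint32_t mant_odd = (fu >> 13) & 1;   // ties-to-even bit
//     fu += 0xc8000fffu + mant_odd;               // rebias + round bias
//     return uint16_t(fu >> 13);                  // exponent + 10-bit mantissa
//   }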
diff --git a/libs/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h index d2a0037..a6346ea 100644 --- a/libs/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TYPE_CASTING_SSE_H #define EIGEN_TYPE_CASTING_SSE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -69,6 +71,14 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f return _mm_cvtps_pd(a); } +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4f& a) { + return _mm_castps_pd(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet2d& a) { + return _mm_castpd_ps(a); +} + template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { return _mm_castps_si128(a); } diff --git a/libs/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h index b139ea2..8b588b1 100644 --- a/libs/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h @@ -10,32 +10,34 @@ #ifndef EIGEN_MATH_FUNCTIONS_SVE_H #define EIGEN_MATH_FUNCTIONS_SVE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { template <> -EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp(const PacketXf& x) { +EIGEN_STRONG_INLINE PacketXf pexp(const PacketXf& x) { return pexp_float(x); } template <> -EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog(const PacketXf& x) { +EIGEN_STRONG_INLINE PacketXf plog(const PacketXf& x) { return plog_float(x); } template <> -EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin(const PacketXf& x) { +EIGEN_STRONG_INLINE PacketXf psin(const PacketXf& x) { return psin_float(x); } template <> -EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos(const PacketXf& x) { +EIGEN_STRONG_INLINE PacketXf pcos(const PacketXf& x) { return pcos_float(x); } // Hyperbolic Tangent function. 
template <> -EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh(const PacketXf& x) { +EIGEN_STRONG_INLINE PacketXf ptanh(const PacketXf& x) { return internal::generic_fast_tanh_float(x); } } // end namespace internal diff --git a/libs/eigen/Eigen/src/Core/arch/SVE/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/SVE/PacketMath.h index 9060b37..9c106b3 100644 --- a/libs/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/SVE/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_SVE_H #define EIGEN_PACKET_MATH_SVE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal @@ -149,7 +151,7 @@ EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b template <> EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) { - return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu); + return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu); } template <> @@ -209,13 +211,13 @@ EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) template EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { - return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N))); + return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), N)); } template EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) { - return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N)); + return svlsl_n_s32_z(svptrue_b32(), a, N); } template <> @@ -523,7 +525,7 @@ EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, template <> EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) { - return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu)); + return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu)); } template <> diff --git a/libs/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h index 7ba5d9c..1067a41 100644 --- a/libs/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TYPE_CASTING_SVE_H #define EIGEN_TYPE_CASTING_SVE_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/libs/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h index 10856ff..57bfb69 100644 --- a/libs/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/libs/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -21,6 +21,8 @@ #ifndef EIGEN_INTEROP_HEADERS_SYCL_H #define EIGEN_INTEROP_HEADERS_SYCL_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { #if !defined(EIGEN_DONT_VECTORIZE_SYCL) diff --git a/libs/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h index 2ab0f2a..9eb46bb 100644 --- a/libs/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h @@ -20,6 +20,8 @@ #ifndef EIGEN_MATH_FUNCTIONS_SYCL_H #define EIGEN_MATH_FUNCTIONS_SYCL_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h index 87badc0..5bc3235 100644 --- a/libs/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -21,6 +21,8 @@ #ifndef EIGEN_PACKET_MATH_SYCL_H #define EIGEN_PACKET_MATH_SYCL_H 
#include +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -475,25 +477,19 @@ pabs(const cl::sycl::cl_double2& a) { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a, const Packet &b) { - return ((a <= b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); + return (a <= b).template as(); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a, const Packet &b) { - return ((a < b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); + return (a < b).template as(); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a, const Packet &b) { - return ((a == b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); + return (a == b).template as(); } #define SYCL_PCMP(OP, TYPE) \ @@ -511,76 +507,6 @@ SYCL_PCMP(lt, cl::sycl::cl_double2) SYCL_PCMP(eq, cl::sycl::cl_double2) #undef SYCL_PCMP -template struct convert_to_integer; - -template <> struct convert_to_integer { - using type = std::int32_t; - using packet_type = cl::sycl::cl_int4; -}; -template <> struct convert_to_integer { - using type = std::int64_t; - using packet_type = cl::sycl::cl_long2; -}; - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer< - typename unpacket_traits::type>::packet_type -vector_as_int(const PacketIn &p) { - return ( - p.template convert::type>::type, - cl::sycl::rounding_mode::automatic>()); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut -convert_vector(const PacketIn &p) { - return (p.template convert::type, - cl::sycl::rounding_mode::automatic>()); -} - -#define SYCL_PAND(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) & vector_as_int(b)); \ - } -SYCL_PAND(cl::sycl::cl_float4) -SYCL_PAND(cl::sycl::cl_double2) -#undef SYCL_PAND - -#define SYCL_POR(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) | vector_as_int(b)); \ - } - -SYCL_POR(cl::sycl::cl_float4) -SYCL_POR(cl::sycl::cl_double2) -#undef SYCL_POR - -#define SYCL_PXOR(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) ^ vector_as_int(b)); \ - } - -SYCL_PXOR(cl::sycl::cl_float4) -SYCL_PXOR(cl::sycl::cl_double2) -#undef SYCL_PXOR - -#define SYCL_PANDNOT(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) & (~vector_as_int(b))); \ - } -SYCL_PANDNOT(cl::sycl::cl_float4) -SYCL_PANDNOT(cl::sycl::cl_double2) -#undef SYCL_PANDNOT - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( PacketBlock& kernel) { float tmp = kernel.packet[0].y(); diff --git a/libs/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/libs/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h index f81e59d..54eedfa 100644 --- a/libs/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +++ b/libs/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h @@ -33,6 +33,8 @@ #include #include +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace TensorSycl { namespace internal { @@ -139,7 +141,7 @@ class PointerMapper { /* basic type for all buffers */ - using buffer_t = cl::sycl::buffer_mem; + using buffer_t = cl::sycl::buffer; /** * Node that stores information about a device 
allocation. @@ -166,7 +168,7 @@ class PointerMapper { /** * Obtain the insertion point in the pointer map for * a pointer of the given size. - * \param requiredSize Size attemted to reclaim + * \param requiredSize Size attempted to reclaim */ typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) { typename pointerMap_t::iterator retVal; @@ -235,17 +237,14 @@ class PointerMapper { template cl::sycl::buffer get_buffer( const virtual_pointer_t ptr) { - using sycl_buffer_t = cl::sycl::buffer; - // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`. - // We can do this without the `buffer_mem` being a pointer, as we - // only declare member variables in the base class (`buffer_mem`) and not in - // the child class (`buffer<>). auto node = get_node(ptr); + auto& map_node = node->second; eigen_assert(node->first == ptr || node->first < ptr); - eigen_assert(ptr < static_cast(node->second.m_size + + eigen_assert(ptr < static_cast(map_node.m_size + node->first)); - return *(static_cast(&node->second.m_buffer)); + return map_node.m_buffer.reinterpret( + cl::sycl::range<1>{map_node.m_size / sizeof(buffer_data_type)}); } /** @@ -427,8 +426,11 @@ class PointerMapper { template virtual_pointer_t add_pointer_impl(BufferT b) { virtual_pointer_t retVal = nullptr; - size_t bufSize = b.get_count(); - pMapNode_t p{b, bufSize, false}; + size_t bufSize = b.get_count() * sizeof(buffer_data_type_t); + auto byte_buffer = + b.template reinterpret(cl::sycl::range<1>{bufSize}); + pMapNode_t p{byte_buffer, bufSize, false}; + // If this is the first pointer: if (m_pointerMap.empty()) { virtual_pointer_t initialVal{m_baseAddress}; diff --git a/libs/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h b/libs/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h index 9208ab2..613e823 100644 --- a/libs/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +++ b/libs/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -21,6 +21,8 @@ #ifndef EIGEN_TYPE_CASTING_SYCL_H #define EIGEN_TYPE_CASTING_SYCL_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/libs/eigen/Eigen/src/Core/arch/ZVector/Complex.h index 0b9b33d..df5c8d4 100644 --- a/libs/eigen/Eigen/src/Core/arch/ZVector/Complex.h +++ b/libs/eigen/Eigen/src/Core/arch/ZVector/Complex.h @@ -8,8 +8,10 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_COMPLEX32_ALTIVEC_H -#define EIGEN_COMPLEX32_ALTIVEC_H +#ifndef EIGEN_COMPLEX32_ZVECTOR_H +#define EIGEN_COMPLEX32_ZVECTOR_H + +#include "../../InternalHeaderCheck.h" namespace Eigen { @@ -91,8 +93,18 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet2cf half; + typedef Packet4f as_real; +}; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet1cd half; + typedef Packet2d as_real; +}; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); @@ -150,7 +162,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); return res; @@ -169,10 +181,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for AltiVec - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) @@ -195,7 +204,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -225,14 +234,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -308,11 +317,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res; - res.cd[0] = pdiv(a.cd[0], b.cd[0]); - res.cd[1] = pdiv(a.cd[1], b.cd[1]); - return res; + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) @@ -394,10 +399,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res = pmul(a, pconj(b)); - Packet4f s = pmul(b.v, b.v); - return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); + return 
pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) @@ -423,4 +425,4 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con } // end namespace Eigen -#endif // EIGEN_COMPLEX32_ALTIVEC_H +#endif // EIGEN_COMPLEX32_ZVECTOR_H diff --git a/libs/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/libs/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h index 1635e12..1f2da26 100644 --- a/libs/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/libs/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -13,79 +13,81 @@ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ -#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H -#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#ifndef EIGEN_MATH_FUNCTIONS_ZVECTOR_H +#define EIGEN_MATH_FUNCTIONS_ZVECTOR_H + +#include "../../InternalHeaderCheck.h" namespace Eigen { namespace internal { #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); -static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); -static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -static _EIGEN_DECLARE_CONST_Packet4i(23, 23); +static EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static EIGEN_DECLARE_CONST_Packet4i(23, 23); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); +static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); /* the smallest non denormalized float number */ -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); +static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); /* natural logarithm computed for 4 simultaneous float return NaN for x <= 0 */ -static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 
2.0000714765E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); -static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); +static EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); #endif -static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); -static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); -static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); +static EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static EIGEN_DECLARE_CONST_Packet2d(half, 0.5); -static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); -static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); +static EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); +static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static 
EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& _x) { Packet2d x = _x; @@ -136,7 +138,7 @@ Packet2d pexp(const Packet2d& _x) isnumber_mask); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp(const Packet4f& _x) { #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) @@ -183,13 +185,13 @@ Packet4f pexp(const Packet4f& _x) #endif } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt(const Packet2d& x) { return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { Packet4f res; @@ -202,12 +204,12 @@ Packet4f psqrt(const Packet4f& x) return res; } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt(const Packet2d& x) { return pset1(1.0) / psqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt(const Packet4f& x) { Packet4f res; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) @@ -221,7 +223,7 @@ Packet4f prsqrt(const Packet4f& x) { // Hyperbolic Tangent function. 
template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& x) { return internal::generic_fast_tanh_float(x); } @@ -230,4 +232,4 @@ ptanh(const Packet4f& x) { } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#endif // EIGEN_MATH_FUNCTIONS_ZVECTOR_H diff --git a/libs/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/libs/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h old mode 100755 new mode 100644 index 1f55a90..26b6f0d --- a/libs/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/libs/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -10,6 +10,8 @@ #ifndef EIGEN_PACKET_MATH_ZVECTOR_H #define EIGEN_PACKET_MATH_ZVECTOR_H +#include "../../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -64,48 +66,48 @@ typedef union { // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead -#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = reinterpret_cast(vec_splat_s32(X)) -#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ Packet2d p2d_##NAME = reinterpret_cast(vec_splat_s64(X)) -#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ Packet2l p2l_##NAME = reinterpret_cast(vec_splat_s64(X)) -#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ Packet2d p2d_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} +static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} -static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); -static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); -static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); +static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); +static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); +static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { numext::bit_cast0x8000000000000000ull), - numext::bit_cast0x8000000000000000ull) }; +static Packet2d p2d_ZERO_ = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ +#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ +#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 
0.0, 0.0, 0.0, 0.0} +static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; #endif @@ -117,9 +119,9 @@ static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; // Mask alignment -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 +#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT) // Handle endianness properly while loading constants // Define global static constants: @@ -358,7 +360,7 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -368,7 +370,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); @@ -376,7 +378,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -386,7 +388,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -460,8 +462,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { @@ -639,7 +641,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -649,7 +651,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; pstore((float *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -785,7 +787,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> 
EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -943,7 +945,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -953,7 +955,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore((float*)af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -978,7 +980,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { diff --git a/libs/eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/libs/eigen/Eigen/src/Core/functors/AssignmentFunctors.h index bf64ef4..c9d80e6 100644 --- a/libs/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/AssignmentFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ASSIGNMENT_FUNCTORS_H #define EIGEN_ASSIGNMENT_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -20,9 +22,8 @@ namespace internal { */ template struct assign_op { - EIGEN_EMPTY_STRUCT_CTOR(assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; } - + template EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a,b); } @@ -45,9 +46,8 @@ struct functor_traits > { */ template struct add_assign_op { - EIGEN_EMPTY_STRUCT_CTOR(add_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; } - + template EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a,internal::padd(internal::ploadt(a),b)); } @@ -66,9 +66,8 @@ struct functor_traits > { */ template struct sub_assign_op { - EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; } - + template EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a,internal::psub(internal::ploadt(a),b)); } @@ -88,9 +87,8 @@ struct functor_traits > { template struct mul_assign_op { - EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; } - + template EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a,internal::pmul(internal::ploadt(a),b)); } @@ -109,9 +107,8 @@ struct functor_traits > { */ template struct div_assign_op { - EIGEN_EMPTY_STRUCT_CTOR(div_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const 
SrcScalar& b) const { a /= b; } - + template EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a,internal::pdiv(internal::ploadt(a),b)); } @@ -141,7 +138,6 @@ struct functor_traits > { */ template struct swap_assign_op { - EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { #ifdef EIGEN_GPUCC diff --git a/libs/eigen/Eigen/src/Core/functors/BinaryFunctors.h b/libs/eigen/Eigen/src/Core/functors/BinaryFunctors.h index 63f09ab..c8bb4e7 100644 --- a/libs/eigen/Eigen/src/Core/functors/BinaryFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/BinaryFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_BINARY_FUNCTORS_H #define EIGEN_BINARY_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -32,9 +34,7 @@ template struct scalar_sum_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_sum_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } @@ -70,9 +70,7 @@ template struct scalar_product_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } @@ -110,13 +108,12 @@ struct scalar_conj_product_op : binary_op_base enum { Conj = NumTraits::IsComplex }; - + typedef typename ScalarBinaryOpTraits::ReturnType result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } @@ -138,7 +135,6 @@ template struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return internal::pmin(a, b); } @@ -171,7 +167,6 @@ template struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return internal::pmax(a,b); } @@ -205,7 +200,11 @@ template struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = false + PacketAccess = is_same::value && + packet_traits::HasCmp && + // Since return type is bool, we currently require the inputs + // to be bool to enable packet access. 
+ is_same::value }; }; @@ -219,50 +218,64 @@ template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_eq(a,b); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_lt(a,b); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_le(a,b); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_lt(b,a); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_le(b,a); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_eq(internal::por(internal::pcmp_le(a, b), internal::pcmp_le(b, a)), internal::pzero(a)); } }; template struct scalar_cmp_op : binary_op_base { typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pcmp_eq(internal::pcmp_eq(a, b), internal::pzero(a)); } }; /** \internal @@ -273,8 +286,6 @@ struct scalar_cmp_op : binary_op_base struct scalar_hypot_op : binary_op_base { - EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const { // This functor is used by hypotNorm only for which it is faster to first apply abs @@ -304,9 +315,7 @@ template struct scalar_pow_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_pow_op() { typedef Scalar LhsScalar; typedef Exponent RhsScalar; @@ -331,7 +340,7 @@ struct functor_traits > { PacketAccess = 
(!NumTraits::IsComplex && !NumTraits::IsInteger && packet_traits::HasExp && packet_traits::HasLog && packet_traits::HasRound && packet_traits::HasCmp && - // Temporarly disable packet access for half/bfloat16 until + // Temporarily disable packet access for half/bfloat16 until // accuracy is improved. !is_same::value && !is_same::value ) @@ -349,9 +358,7 @@ template struct scalar_difference_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_difference_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } @@ -369,6 +376,28 @@ struct functor_traits > { }; }; +template ::type>::IsInteger> +struct maybe_raise_div_by_zero { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { + EIGEN_UNUSED_VARIABLE(x); + } +}; + +#ifndef EIGEN_GPU_COMPILE_PHASE +template +struct maybe_raise_div_by_zero { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { + if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) { + // Use volatile variables to force a division by zero, which will + // result in the default platform behaviour (usually SIGFPE). + volatile typename unpacket_traits::type zero = 0; + volatile typename unpacket_traits::type val = 1; + val = val / zero; + } + } +}; +#endif + /** \internal * \brief Template functor to compute the quotient of two scalars * @@ -378,17 +407,17 @@ template struct scalar_quotient_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_quotient_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pdiv(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { + maybe_raise_div_by_zero::run(b); + return internal::pdiv(a,b); + } }; template struct functor_traits > { @@ -407,7 +436,6 @@ struct functor_traits > { * \sa class CwiseBinaryOp, ArrayBase::operator&& */ struct scalar_boolean_and_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const @@ -426,7 +454,6 @@ template<> struct functor_traits { * \sa class CwiseBinaryOp, ArrayBase::operator|| */ struct scalar_boolean_or_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const @@ -445,7 +472,6 @@ template<> struct functor_traits { * \sa class CwiseBinaryOp, ArrayBase::operator^ */ struct scalar_boolean_xor_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const @@ -467,9 +493,7 @@ template struct scalar_absolute_difference_op : 
binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) -#else +#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN scalar_absolute_difference_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } @@ -489,6 +513,73 @@ struct functor_traits > { }; +template +struct scalar_atan2_op { + using Scalar = LhsScalar; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t::value, Scalar> + operator()(const Scalar& y, const Scalar& x) const { + EIGEN_USING_STD(atan2); + return static_cast(atan2(y, x)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::value, Packet> + packetOp(const Packet& y, const Packet& x) const { + // See https://en.cppreference.com/w/cpp/numeric/math/atan2 + // for how corner cases are supposed to be handled according to the + // IEEE floating-point standard (IEC 60559). + const Packet kSignMask = pset1(-Scalar(0)); + const Packet kPi = pset1(Scalar(EIGEN_PI)); + const Packet kPiO2 = pset1(Scalar(EIGEN_PI / 2)); + const Packet kPiO4 = pset1(Scalar(EIGEN_PI / 4)); + const Packet k3PiO4 = pset1(Scalar(3.0 * (EIGEN_PI / 4))); + + // Various predicates about the inputs. + Packet x_signbit = pand(x, kSignMask); + Packet x_has_signbit = pcmp_lt(por(x_signbit, kPi), pzero(x)); + Packet x_is_zero = pcmp_eq(x, pzero(x)); + Packet x_neg = pandnot(x_has_signbit, x_is_zero); + + Packet y_signbit = pand(y, kSignMask); + Packet y_is_zero = pcmp_eq(y, pzero(y)); + Packet x_is_not_nan = pcmp_eq(x, x); + Packet y_is_not_nan = pcmp_eq(y, y); + + // Compute the normal case. Notice that we expect that + // finite/infinite = +/-0 here. + Packet result = patan(pdiv(y, x)); + + // Compute shift for when x != 0 and y != 0. + Packet shift = pselect(x_neg, por(kPi, y_signbit), pzero(x)); + + // Special cases: + // Handle x = +/-inf && y = +/-inf. + Packet is_not_nan = pcmp_eq(result, result); + result = + pselect(is_not_nan, padd(shift, result), + pselect(x_neg, por(k3PiO4, y_signbit), por(kPiO4, y_signbit))); + // Handle x == +/-0. + result = pselect( + x_is_zero, pselect(y_is_zero, pzero(y), por(y_signbit, kPiO2)), result); + // Handle y == +/-0. + result = pselect( + y_is_zero, + pselect(x_has_signbit, por(y_signbit, kPi), por(y_signbit, pzero(y))), + result); + // Handle NaN inputs. 
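+    // Per IEC 60559 (see the cppreference link above), atan2 returns NaN only
+    // when an input is NaN; every other combination, including atan2(+/-0, +/-0),
+    // has a defined value and is produced by the selects above, so the final
+    // select below overwrites only the NaN lanes.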
+ Packet kQNaN = pset1(NumTraits::quiet_NaN()); + return pselect(pand(x_is_not_nan, y_is_not_nan), result, kQNaN); + } +}; + +template + struct functor_traits> { + enum { + PacketAccess = is_same::value && packet_traits::HasATan && packet_traits::HasDiv && !NumTraits::IsInteger && !NumTraits::IsComplex, + Cost = + scalar_div_cost::value + 5 * NumTraits::MulCost + 5 * NumTraits::AddCost + }; +}; //---------- binary functors bound to a constant, thus appearing as a unary functor ---------- diff --git a/libs/eigen/Eigen/src/Core/functors/NullaryFunctors.h b/libs/eigen/Eigen/src/Core/functors/NullaryFunctors.h index 192f225..4943d87 100644 --- a/libs/eigen/Eigen/src/Core/functors/NullaryFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/NullaryFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_NULLARY_FUNCTORS_H #define EIGEN_NULLARY_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -29,7 +31,6 @@ struct functor_traits > PacketAccess = packet_traits::Vectorizable, IsRepeatable = true }; }; template struct scalar_identity_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); } }; @@ -144,6 +145,39 @@ template struct linspaced_op const linspaced_op_impl::IsInteger> impl; }; +template +struct equalspaced_op { + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC equalspaced_op(const Scalar& start, const Scalar& step) : m_start(start), m_step(step) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const { + return m_start + m_step * static_cast(i); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(IndexType i) const { + const Packet cst_start = pset1(m_start); + const Packet cst_step = pset1(m_step); + const Packet cst_lin0 = plset(Scalar(0)); + const Packet cst_offset = pmadd(cst_lin0, cst_step, cst_start); + + Packet i_packet = pset1(static_cast(i)); + return pmadd(i_packet, cst_step, cst_offset); + } + const Scalar m_start; + const Scalar m_step; +}; + +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost + NumTraits::MulCost, + PacketAccess = + packet_traits::HasSetLinear && packet_traits::HasMul && packet_traits::HasAdd, + IsRepeatable = true + }; +}; + // Linear access is automatically determined from the operator() prototypes available for the given functor. // If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently // and linear access is not possible. In all other cases, linear access is enabled. @@ -152,7 +186,7 @@ template struct functor_has_linear_access { enum { ret = !has_ // For unreliable compilers, let's specialize the has_*ary_operator // helpers so that at least built-in nullary functors work fine. -#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600)) +#if !( EIGEN_COMP_MSVC || EIGEN_COMP_GNUC || (EIGEN_COMP_ICC>=1600)) template struct has_nullary_operator,IndexType> { enum { value = 1}; }; template diff --git a/libs/eigen/Eigen/src/Core/functors/StlFunctors.h b/libs/eigen/Eigen/src/Core/functors/StlFunctors.h index 4570c9b..5971075 100644 --- a/libs/eigen/Eigen/src/Core/functors/StlFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/StlFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_STL_FUNCTORS_H #define EIGEN_STL_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { // Portable replacements for certain functors. 
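Stepping back to the equalspaced_op functor added to NullaryFunctors.h above: its packetOp folds the per-lane plset ramp into a precomputed offset, so lane j of the packet generated for index i evaluates to start + (i + j) * step. A hypothetical scalar model of that packet path, outside the patch, with N standing in for the packet size:

#include <array>
#include <cstddef>

template <typename Scalar, std::size_t N>
std::array<Scalar, N> equalspaced_packet(Scalar start, Scalar step, std::ptrdiff_t i) {
  // cst_offset = plset(0) * step + start; packetOp then adds i * step on top,
  // so lane j holds start + (i + j) * step.
  std::array<Scalar, N> out{};
  for (std::size_t j = 0; j < N; ++j) {
    out[j] = start + static_cast<Scalar>(i + static_cast<std::ptrdiff_t>(j)) * step;
  }
  return out;
}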
@@ -102,17 +104,6 @@ template struct functor_traits > : functor_traits > {}; -#if (EIGEN_COMP_CXXVER < 11) -// std::binder* are deprecated since c++11 and will be removed in c++17 -template -struct functor_traits > -{ enum { Cost = functor_traits::Cost, PacketAccess = false }; }; - -template -struct functor_traits > -{ enum { Cost = functor_traits::Cost, PacketAccess = false }; }; -#endif - #if (EIGEN_COMP_CXXVER < 17) // std::unary_negate is deprecated since c++17 and will be removed in c++20 template diff --git a/libs/eigen/Eigen/src/Core/functors/TernaryFunctors.h b/libs/eigen/Eigen/src/Core/functors/TernaryFunctors.h index b254e96..41c0d5f 100644 --- a/libs/eigen/Eigen/src/Core/functors/TernaryFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/TernaryFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TERNARY_FUNCTORS_H #define EIGEN_TERNARY_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/functors/UnaryFunctors.h b/libs/eigen/Eigen/src/Core/functors/UnaryFunctors.h index 16136d1..3485369 100644 --- a/libs/eigen/Eigen/src/Core/functors/UnaryFunctors.h +++ b/libs/eigen/Eigen/src/Core/functors/UnaryFunctors.h @@ -10,6 +10,8 @@ #ifndef EIGEN_UNARY_FUNCTORS_H #define EIGEN_UNARY_FUNCTORS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -20,7 +22,6 @@ namespace internal { * \sa class CwiseUnaryOp, MatrixBase::operator- */ template struct scalar_opposite_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const @@ -39,7 +40,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, Cwise::abs */ template struct scalar_abs_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); } template @@ -70,14 +70,12 @@ struct functor_traits > : functor_traits struct abs_knowing_score { - EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) typedef typename NumTraits::Real result_type; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); } }; template struct abs_knowing_score::Score_is_abs> { - EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) typedef typename NumTraits::Real result_type; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; } @@ -89,7 +87,6 @@ template struct abs_knowing_score struct scalar_abs2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); } @@ -107,7 +104,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, MatrixBase::conjugate() */ template struct scalar_conjugate_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template @@ -136,7 +132,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, Cwise::arg */ template struct scalar_arg_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return 
numext::arg(a); } template @@ -158,7 +153,6 @@ struct functor_traits > */ template struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef NewType result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast(a); } }; @@ -173,7 +167,6 @@ struct functor_traits > */ template struct scalar_shift_right_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return a >> N; } @@ -192,8 +185,6 @@ struct functor_traits > */ template struct scalar_shift_left_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return a << N; } template @@ -211,7 +202,6 @@ struct functor_traits > */ template struct scalar_real_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); } @@ -227,7 +217,6 @@ struct functor_traits > */ template struct scalar_imag_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); } @@ -243,7 +232,6 @@ struct functor_traits > */ template struct scalar_real_ref_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast(&a)); } @@ -259,7 +247,6 @@ struct functor_traits > */ template struct scalar_imag_ref_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op) typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast(&a)); } @@ -275,8 +262,7 @@ struct functor_traits > * \sa class CwiseUnaryOp, Cwise::exp() */ template struct scalar_exp_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return internal::pexp(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; @@ -315,7 +301,6 @@ struct functor_traits > { * \sa class CwiseUnaryOp, ArrayBase::expm1() */ template struct scalar_expm1_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); } @@ -335,7 +320,6 @@ struct functor_traits > { * \sa class CwiseUnaryOp, ArrayBase::log() */ template struct scalar_log_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } @@ -366,7 +350,6 @@ struct functor_traits > { * \sa class CwiseUnaryOp, ArrayBase::log1p() */ template struct scalar_log1p_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); } 
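The scalar_log1p_op/scalar_expm1_op functors kept above exist because computing log(1 + x) or exp(x) - 1 through the naive expressions loses all significance for tiny x. A quick standalone illustration using only the C++ standard library:

#include <cmath>
#include <cstdio>

int main() {
  const double x = 1e-17;
  // 1.0 + x rounds to exactly 1.0 in double precision, so the naive form gives 0.
  std::printf("log(1+x) = %.17g\n", std::log(1.0 + x));
  // std::log1p works on x directly and returns ~1e-17, accurate to full precision.
  std::printf("log1p(x) = %.17g\n", std::log1p(x));
  return 0;
}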
@@ -386,7 +369,6 @@ struct functor_traits > { * \sa class CwiseUnaryOp, Cwise::log10() */ template struct scalar_log10_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } @@ -402,7 +384,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, Cwise::log2() */ template struct scalar_log2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); } @@ -416,7 +397,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, Cwise::sqrt() */ template struct scalar_sqrt_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } @@ -440,7 +420,6 @@ struct functor_traits > { // Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings. template<> struct scalar_sqrt_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } template EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; } @@ -455,7 +434,6 @@ struct functor_traits > { * \sa class CwiseUnaryOp, Cwise::rsqrt() */ template struct scalar_rsqrt_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } @@ -474,7 +452,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::cos() */ template struct scalar_cos_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } @@ -493,7 +470,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::sin() */ template struct scalar_sin_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); } @@ -513,7 +489,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::tan() */ template struct scalar_tan_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } @@ -532,7 +507,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::acos() */ template struct scalar_acos_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } @@ -551,7 +525,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::asin() */ template struct scalar_asin_op { - 
EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } @@ -571,7 +544,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::atan() */ template struct scalar_atan_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } @@ -591,7 +563,6 @@ struct functor_traits > */ template struct scalar_tanh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); } @@ -622,14 +593,12 @@ struct functor_traits > { }; }; -#if EIGEN_HAS_CXX11_MATH /** \internal * \brief Template functor to compute the atanh of a scalar * \sa class CwiseUnaryOp, ArrayBase::atanh() */ template struct scalar_atanh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } }; @@ -637,14 +606,12 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; }; -#endif /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() */ template struct scalar_sinh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } @@ -658,14 +625,12 @@ struct functor_traits > }; }; -#if EIGEN_HAS_CXX11_MATH /** \internal * \brief Template functor to compute the asinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::asinh() */ template struct scalar_asinh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } }; @@ -673,14 +638,12 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; }; -#endif /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() */ template struct scalar_cosh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } @@ -694,14 +657,12 @@ struct functor_traits > }; }; -#if EIGEN_HAS_CXX11_MATH /** \internal * \brief Template functor to compute the acosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::acosh() */ template struct scalar_acosh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } }; @@ -709,7 +670,6 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; }; -#endif /** \internal * \brief Template functor to compute the inverse of a scalar @@ -717,17 +677,21 @@ struct functor_traits > { */ template struct scalar_inverse_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return 
Scalar(1)/a; } template EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const - { return internal::pdiv(pset1(Scalar(1)),a); } + { return internal::preciprocal(a); } }; template struct functor_traits > { enum { PacketAccess = packet_traits::HasDiv, - Cost = scalar_div_cost::value + // If packet_traits::HasReciprocal then the Estimated cost is that + // of computing an approximation plus a single Newton-Raphson step, which + // consists of 1 pmul + 1 pmadd. + Cost = (packet_traits::HasReciprocal + ? 4 * NumTraits::MulCost + : scalar_div_cost::value) }; }; @@ -737,7 +701,6 @@ struct functor_traits > { */ template struct scalar_square_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; } template EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const @@ -750,7 +713,6 @@ struct functor_traits > // Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. template<> struct scalar_square_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } template EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const @@ -766,7 +728,6 @@ struct functor_traits > */ template struct scalar_cube_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; } template EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const @@ -779,7 +740,6 @@ struct functor_traits > // Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. template<> struct scalar_cube_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } template EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const @@ -794,7 +754,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::round() */ template struct scalar_round_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); } @@ -813,7 +772,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::floor() */ template struct scalar_floor_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); } @@ -832,7 +790,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::rint() */ template struct scalar_rint_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); } @@ -851,7 +808,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::ceil() */ template struct scalar_ceil_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } @@ -870,7 +826,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, 
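// Editor's note on the scalar_inverse_op change above: a scalar model of the new
// cost comment (a sketch, not Eigen's kernel). One Newton-Raphson refinement of a
// reciprocal estimate y ~ 1/a computes y' = y*(2 - a*y), i.e. one multiply-add plus
// one multiply -- hence the ~4*MulCost charged when HasReciprocal is set.
//
//   inline float refine_reciprocal(float a, float y) {
//     const float e = 2.0f - a * y;  // the pmadd-shaped step
//     return y * e;                  // the pmul
//   }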
ArrayBase::isnan() */ template struct scalar_isnan_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { #if defined(SYCL_DEVICE_ONLY) @@ -894,7 +849,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::isinf() */ template struct scalar_isinf_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { #if defined(SYCL_DEVICE_ONLY) @@ -918,7 +872,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::isfinite() */ template struct scalar_isfinite_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { #if defined(SYCL_DEVICE_ONLY) @@ -943,7 +896,6 @@ struct functor_traits > * \sa class CwiseUnaryOp, ArrayBase::operator! */ template struct scalar_boolean_not_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; } }; template @@ -958,47 +910,19 @@ struct functor_traits > { * \brief Template functor to compute the signum of a scalar * \sa class CwiseUnaryOp, Cwise::sign() */ -template::IsComplex!=0), bool is_integer=(NumTraits::IsInteger!=0) > struct scalar_sign_op; template -struct scalar_sign_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) +struct scalar_sign_op { EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { - return Scalar( (a>Scalar(0)) - (a + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { + return internal::psign(a); } - //TODO - //template - //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; -template -struct scalar_sign_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const - { - return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a - //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } -}; - -template -struct scalar_sign_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const - { - typedef typename NumTraits::Real real_type; - real_type aa = numext::abs(a); - if (aa==real_type(0)) - return Scalar(0); - aa = real_type(1)/aa; - return Scalar(a.real()*aa, a.imag()*aa ); - } - //TODO - //template - //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } -}; template struct functor_traits > { enum { @@ -1006,7 +930,7 @@ struct functor_traits > NumTraits::IsComplex ? 
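// Editor's note on the scalar_sign_op rewrite above: the three specializations are
// folded into one and the packet path is finally enabled via internal::psign. For
// real, non-integer scalars, the removed specialization computed, in effect (a
// plain-code restatement of the deleted lines; NaN is propagated):
//
//   #include <cmath>
//   inline float sign_model(float a) {
//     if (std::isnan(a)) return a;
//     return static_cast<float>((a > 0.0f) - (a < 0.0f));
//   }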
( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), - PacketAccess = packet_traits::HasSign + PacketAccess = packet_traits::HasSign && packet_traits::Vectorizable }; }; @@ -1016,7 +940,6 @@ struct functor_traits > */ template struct scalar_logistic_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return packetOp(x); } @@ -1024,87 +947,104 @@ struct scalar_logistic_op { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); + const Packet inf = pset1(NumTraits::infinity()); + const Packet e = pexp(x); + const Packet inf_mask = pcmp_eq(e, inf); + return pselect(inf_mask, one, pdiv(e, padd(one, e))); } }; -#ifndef EIGEN_GPU_COMPILE_PHASE +// TODO(rmlarsen): Enable the following on host when integer_packet is defined +// for the relevant packet types. +#ifdef EIGEN_GPU_CC + /** \internal * \brief Template specialization of the logistic function for float. - * - * Uses just a 9/10-degree rational interpolant which - * interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range - * [-9, 18]. Below -9 we use the more accurate approximation - * 1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing - * one ulp. The shifted logistic is interpolated because it was easier to - * make the fit converge. - * + * Computes S(x) = exp(x) / (1 + exp(x)), where exp(x) is implemented + * using an algorithm partly adopted from the implementation of + * pexp_float. See the individual steps described in the code below. + * Note that compared to pexp, we use an additional outer multiplicative + * range reduction step using the identity exp(x) = exp(x/2)^2. + * This prevents us from having to call ldexp on values that could produce + * a denormal result, which allows us to call the faster implementation in + * pldexp_fast_impl::run(p, m). + * The final squaring, however, doubles the error bound on the final + * approximation. Exhaustive testing shows that we have a worst case error + * of 4.5 ulps (compared to computing S(x) in double precision), which is + * acceptable. */ template <> struct scalar_logistic_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { - return packetOp(x); + // Truncate at the first point where the interpolant is exactly one. + const float cst_exp_hi = 16.6355324f; + const float e = numext::exp(numext::mini(x, cst_exp_hi)); + return e / (1.0f + e); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& _x) const { - const Packet cutoff_lower = pset1(-9.f); - const Packet lt_mask = pcmp_lt(_x, cutoff_lower); - const bool any_small = predux_any(lt_mask); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet + packetOp(const Packet& _x) const { + const Packet cst_zero = pset1(0.0f); + const Packet cst_one = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + // Truncate at the first point where the interpolant is exactly one. + const Packet cst_exp_hi = pset1(16.6355324f); + const Packet cst_exp_lo = pset1(-104.f); - // The upper cut-off is the smallest x for which the rational approximation evaluates to 1. - // Choosing this value saves us a few instructions clamping the results at the end.
-#ifdef EIGEN_VECTORIZE_FMA - const Packet cutoff_upper = pset1(15.7243833541870117f); -#else - const Packet cutoff_upper = pset1(15.6437711715698242f); -#endif - const Packet x = pmin(_x, cutoff_upper); + // Clamp x to the non-trivial range of S(x). Outside this + // interval the correctly rounded value of S(x) is either zero + // or one. + Packet zero_mask = pcmp_lt(_x, cst_exp_lo); + Packet x = pmin(_x, cst_exp_hi); - // The monomial coefficients of the numerator polynomial (odd). - const Packet alpha_1 = pset1(2.48287947061529e-01f); - const Packet alpha_3 = pset1(8.51377133304701e-03f); - const Packet alpha_5 = pset1(6.08574864600143e-05f); - const Packet alpha_7 = pset1(1.15627324459942e-07f); - const Packet alpha_9 = pset1(4.37031012579801e-11f); + // 1. Multiplicative range reduction: + // Reduce the range of x by a factor of 2. This avoids having + // to compute exp(x) accurately where the result is a denormalized + // value. + x = pmul(x, cst_half); - // The monomial coefficients of the denominator polynomial (even). - const Packet beta_0 = pset1(9.93151921023180e-01f); - const Packet beta_2 = pset1(1.16817656904453e-01f); - const Packet beta_4 = pset1(1.70198817374094e-03f); - const Packet beta_6 = pset1(6.29106785017040e-06f); - const Packet beta_8 = pset1(5.76102136993427e-09f); - const Packet beta_10 = pset1(6.10247389755681e-13f); + // 2. Subtractive range reduction: + // Express exp(x) as exp(m*ln(2) + r) = 2^m*exp(r); start by extracting + // m = floor(x/ln(2) + 0.5), such that x = m*ln(2) + r. + const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); + Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); + // Get r = x - m*ln(2). We use a trick from Cephes where the term + // m*ln(2) is subtracted out in two parts, m*C1+m*C2 = m*ln(2), + // to avoid accumulating truncation errors. + const Packet cst_cephes_exp_C1 = pset1(-0.693359375f); + const Packet cst_cephes_exp_C2 = pset1(2.12194440e-4f); + Packet r = pmadd(m, cst_cephes_exp_C1, x); + r = pmadd(m, cst_cephes_exp_C2, r); - // Since the polynomials are odd/even, we need x^2. - const Packet x2 = pmul(x, x); + // 3. Compute an approximation to exp(r) using a degree 5 minimax polynomial. + // We compute even and odd terms separately to increase instruction level + // parallelism. + Packet r2 = pmul(r, r); + const Packet cst_p2 = pset1(0.49999141693115234375f); + const Packet cst_p3 = pset1(0.16666877269744873046875f); + const Packet cst_p4 = pset1(4.1898667812347412109375e-2f); + const Packet cst_p5 = pset1(8.33471305668354034423828125e-3f); - // Evaluate the numerator polynomial p. - Packet p = pmadd(x2, alpha_9, alpha_7); - p = pmadd(x2, p, alpha_5); - p = pmadd(x2, p, alpha_3); - p = pmadd(x2, p, alpha_1); - p = pmul(x, p); + const Packet p_even = pmadd(r2, cst_p4, cst_p2); + const Packet p_odd = pmadd(r2, cst_p5, cst_p3); + const Packet p_low = padd(r, cst_one); + Packet p = pmadd(r, p_odd, p_even); + p = pmadd(r2, p, p_low); - // Evaluate the denominator polynomial q. - Packet q = pmadd(x2, beta_10, beta_8); - q = pmadd(x2, q, beta_6); - q = pmadd(x2, q, beta_4); - q = pmadd(x2, q, beta_2); - q = pmadd(x2, q, beta_0); - // Divide the numerator by the denominator and shift it up. - const Packet logistic = padd(pdiv(p, q), pset1(0.5f)); - if (EIGEN_PREDICT_FALSE(any_small)) { - const Packet exponential = pexp(_x); - return pselect(lt_mask, exponential, logistic); - } else { - return logistic; - } + // 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r).
+ Packet e = pldexp_fast_impl::run(p, m); + + // 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2. + e = pmul(e, e); + + // Return exp(x) / (1 + exp(x)) + return pselect(zero_mask, cst_zero, pdiv(e, padd(cst_one, e))); } }; #endif // #ifndef EIGEN_GPU_COMPILE_PHASE + template struct functor_traits > { enum { @@ -1124,6 +1064,97 @@ struct functor_traits > { }; }; +template ::IsInteger, + bool IsExponentInteger = NumTraits::IsInteger, + bool IsBaseComplex = NumTraits::IsComplex, + bool IsExponentComplex = NumTraits::IsComplex> +struct scalar_unary_pow_op { + typedef typename internal::promote_scalar_arg< + Scalar, ExponentScalar, + internal::has_ReturnType >::value>::type PromotedExponent; + typedef typename ScalarBinaryOpTraits::ReturnType result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const { + EIGEN_USING_STD(pow); + return static_cast(pow(a, m_exponent)); + } + + private: + const ExponentScalar m_exponent; + scalar_unary_pow_op() {} +}; + +template +constexpr int exponent_digits() { + return CHAR_BIT * sizeof(T) - NumTraits::digits() - NumTraits::IsSigned; +} + +template +struct is_floating_exactly_representable { + // TODO(rmlarsen): Add radix to NumTraits and enable this check. + // (NumTraits::radix == NumTraits::radix) && + static constexpr bool value = (exponent_digits() >= exponent_digits() && + NumTraits::digits() >= NumTraits::digits()); +}; + + +// Specialization for real, non-integer types, non-complex types. +template +struct scalar_unary_pow_op { + template ::value> + std::enable_if_t check_is_representable() const {} + + // Issue a deprecation warning if we do a narrowing conversion on the exponent. + template ::value> + EIGEN_DEPRECATED std::enable_if_t check_is_representable() const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(static_cast(exponent)) { + check_is_representable(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { + EIGEN_USING_STD(pow); + return static_cast(pow(a, m_exponent)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { + return unary_pow_impl::run(a, m_exponent); + } + + private: + const Scalar m_exponent; + scalar_unary_pow_op() {} +}; + +template +struct scalar_unary_pow_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {} + // TODO: error handling logic for complex^real_integer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { + return unary_pow_impl::run(a, m_exponent); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { + return unary_pow_impl::run(a, m_exponent); + } + + private: + const ExponentScalar m_exponent; + scalar_unary_pow_op() {} +}; + +template +struct functor_traits> { + enum { + GenPacketAccess = functor_traits>::PacketAccess, + IntPacketAccess = !NumTraits::IsComplex && packet_traits::HasMul && (packet_traits::HasDiv || NumTraits::IsInteger) && packet_traits::HasCmp, + PacketAccess = NumTraits::IsInteger ? 
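// Editor's note, pausing on the logistic kernel above: its five steps transcribe
// to plain scalar code with the same constants. A sketch (our transcription; the
// shipped kernel works on packets and uses pldexp_fast_impl to undo step 2):
//
//   #include <algorithm>
//   #include <cmath>
//
//   float logistic_sketch(float x) {
//     if (x < -104.0f) return 0.0f;              // below cst_exp_lo: rounds to 0
//     x = 0.5f * std::min(x, 16.6355324f);       // clamp + multiplicative reduction
//     // Subtractive range reduction: x = m*ln(2) + r.
//     float m = std::floor(x * 1.44269504088896341f + 0.5f);
//     float r = x - m * 0.693359375f;            // m*ln(2) subtracted in two parts
//     r += m * 2.12194440e-4f;
//     // Degree-5 minimax polynomial for exp(r), even/odd terms split.
//     float r2 = r * r;
//     float p_even = 0.49999141693115234375f + r2 * 4.1898667812347412109375e-2f;
//     float p_odd = 0.16666877269744873046875f + r2 * 8.33471305668354034423828125e-3f;
//     float p = (1.0f + r) + r2 * (p_even + r * p_odd);
//     float e = std::ldexp(p, static_cast<int>(m)); // undo subtractive reduction
//     e *= e;                                       // undo exp(x) = exp(x/2)^2
//     return e / (1.0f + e);                        // S(x) = e/(1+e)
//   }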
IntPacketAccess : (IntPacketAccess && GenPacketAccess), + Cost = functor_traits>::Cost + }; +}; + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/libs/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h index f35b760..4a6cef5 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -11,6 +11,8 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -21,7 +23,7 @@ enum GEBPPacketSizeType { GEBPPacketQuarter }; -template +template class gebp_traits; @@ -55,8 +57,13 @@ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*10 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); #elif EIGEN_ARCH_PPC const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +#ifdef _ARCH_PWR10 +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2*1024*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8*1024*1024); +#else const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); +#endif #else const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); @@ -352,9 +359,9 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ template struct RhsPanelHelper { private: - static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + static constexpr int remaining_registers = (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0); public: - typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; + typedef std::conditional_t=4, RhsPacketx4, RhsPacket> type; }; template @@ -376,12 +383,12 @@ struct packet_conditional { typedef T1 type; }; template struct packet_conditional { typedef T2 type; }; -#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename packet_conditional::type, \ typename packet_traits::half, \ typename unpacket_traits::half>::half>::type \ - prefix ## name ## Packet + name ## Packet ## postfix #define PACKET_DECL_COND(name, packet_size) \ typedef typename packet_conditional { typedef T2 type; }; typename unpacket_traits::half>::half>::type \ name ## Packet -#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ +#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \ typedef typename packet_conditional::type, \ typename packet_traits::half, \ typename unpacket_traits::half>::half>::type \ - prefix ## ScalarPacket + ScalarPacket ## postfix #define PACKET_DECL_COND_SCALAR(packet_size) \ typedef typename packet_conditional { typedef T2 type; }; * cplx*real : unpack rhs to constant packets, ... 
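// Editor's note, backing up to the unary pow functor above: the exactness check
// behind its deprecation warning can be modeled without Eigen's NumTraits. A sketch
// (names are ours; std::numeric_limits stands in for NumTraits, and Eigen's TODO
// anticipates an additional radix comparison):
//
//   #include <climits>
//   #include <limits>
//
//   template <typename T>
//   constexpr int exponent_digits_model() {
//     return CHAR_BIT * sizeof(T) - std::numeric_limits<T>::digits
//            - (std::numeric_limits<T>::is_signed ? 1 : 0);
//   }
//
//   template <typename To, typename From>
//   constexpr bool exactly_representable() {
//     return exponent_digits_model<To>() >= exponent_digits_model<From>() &&
//            std::numeric_limits<To>::digits >= std::numeric_limits<From>::digits;
//   }
//
//   // Casting a float exponent to double is exact; the reverse may narrow, which
//   // is what selects the EIGEN_DEPRECATED check_is_representable() overload.
//   static_assert(exactly_representable<double, float>(), "widening is exact");
//   static_assert(!exactly_representable<float, double>(), "narrowing can lose bits");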
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: - typedef _LhsScalar LhsScalar; - typedef _RhsScalar RhsScalar; + typedef LhsScalar_ LhsScalar; + typedef RhsScalar_ RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); enum { - ConjLhs = _ConjLhs, - ConjRhs = _ConjRhs, - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + ConjLhs = ConjLhs_, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -440,7 +447,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) - default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, + default_mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*LhsPacketSize, #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) // we assume 16 registers or more @@ -457,9 +464,9 @@ public: }; - typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; - typedef typename conditional::type ResPacket; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t RhsPacket; + typedef std::conditional_t ResPacket; typedef LhsPacket LhsPacket4Packing; typedef QuadPacket RhsPacketx4; @@ -543,25 +550,25 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> +template +class gebp_traits, RealScalar, ConjLhs_, false, Arch, PacketSize_> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); enum { - ConjLhs = _ConjLhs, + ConjLhs = ConjLhs_, ConjRhs = false, - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -569,16 +576,16 @@ public: // we assume 16 registers mr = 3*LhsPacketSize, #else - mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, + mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*LhsPacketSize, #endif LhsProgress = LhsPacketSize, RhsProgress = 1 }; - typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; - typedef typename conditional::type ResPacket; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t RhsPacket; + typedef std::conditional_t ResPacket; typedef LhsPacket LhsPacket4Packing; typedef QuadPacket RhsPacketx4; @@ -612,7 +619,7 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - loadRhsQuad_impl(b,dest, typename conditional::type()); + loadRhsQuad_impl(b,dest, std::conditional_t()); } EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const @@ -643,7 +650,7 @@ public: template EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { - madd_impl(a, b, c, tmp, typename conditional::type()); + madd_impl(a, b, c, tmp, std::conditional_t()); } template @@ -701,7 +708,7 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket const DoublePacket& predux_half_dowto4(const DoublePacket &a, - typename enable_if::size<=8>::type* = 0) + std::enable_if_t::size<=8>* = 0) { return a; } @@ -709,7 +716,7 @@ predux_half_dowto4(const DoublePacket &a, template DoublePacket::half> predux_half_dowto4(const DoublePacket &a, - typename enable_if::size==16>::type* = 0) + std::enable_if_t::size==16>* = 0) { // yes, that's pretty hackish :( DoublePacket::half> res; @@ -723,7 +730,7 @@ predux_half_dowto4(const DoublePacket &a, // same here, "quad" actually means "8" in terms of real coefficients template void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, - typename enable_if::size<=8>::type* = 0) + std::enable_if_t::size<=8>* = 0) { dest.first = pset1(numext::real(*b)); dest.second = pset1(numext::imag(*b)); @@ -731,7 +738,7 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, template void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, - typename enable_if::size==16>::type* = 0) + std::enable_if_t::size==16>* = 0) { // yes, that's pretty hackish too :( typedef typename NumTraits::Real RealScalar; @@ -744,6 +751,9 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, template struct unpacket_traits > { typedef DoublePacket::half> half; + enum{ + size = 2 * unpacket_traits::size + }; }; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) @@ -754,8 +764,8 @@ template struct unpacket_traits > { // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > +template +class gebp_traits, std::complex, ConjLhs_, ConjRhs_, Arch, PacketSize_ > { public: typedef std::complex Scalar; @@ -763,19 +773,19 @@ public: typedef std::complex RhsScalar; typedef std::complex ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - PACKET_DECL_COND(Real, _PacketSize); - PACKET_DECL_COND_SCALAR(_PacketSize); + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + 
PACKET_DECL_COND(Real, PacketSize_); + PACKET_DECL_COND_SCALAR(PacketSize_); enum { - ConjLhs = _ConjLhs, - ConjRhs = _ConjRhs, + ConjLhs = ConjLhs_, + ConjRhs = ConjRhs_, Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, RealPacketSize = Vectorizable ? unpacket_traits::size : 1, @@ -789,13 +799,13 @@ public: typedef DoublePacket DoublePacketType; - typedef typename conditional::type LhsPacket4Packing; - typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; - typedef typename conditional::type ResPacket; - typedef typename conditional::type AccPacket; + typedef std::conditional_t LhsPacket4Packing; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t RhsPacket; + typedef std::conditional_t ResPacket; + typedef std::conditional_t AccPacket; - // this actualy holds 8 packets! + // this actually holds 8 packets! typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -866,7 +876,7 @@ public: template EIGEN_STRONG_INLINE - typename enable_if::value>::type + std::enable_if_t::value> madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); @@ -920,8 +930,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs, Arch, _PacketSize > +template +class gebp_traits, false, ConjRhs_, Arch, PacketSize_ > { public: typedef std::complex Scalar; @@ -929,38 +939,38 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); - PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_); + PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_); -#undef PACKET_DECL_COND_SCALAR_PREFIX -#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR_POSTFIX +#undef PACKET_DECL_COND_POSTFIX #undef PACKET_DECL_COND_SCALAR #undef PACKET_DECL_COND enum { ConjLhs = false, - ConjRhs = _ConjRhs, - Vectorizable = unpacket_traits<_RealPacket>::vectorizable - && unpacket_traits<_ScalarPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters nr = 4, - mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, + mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 }; - typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; - typedef typename conditional::type ResPacket; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t RhsPacket; + typedef std::conditional_t ResPacket; typedef LhsPacket LhsPacket4Packing; typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -1009,7 +1019,7 @@ public: template EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { - madd_impl(a, b, c, tmp, typename conditional::type()); + madd_impl(a, b, c, tmp, std::conditional_t()); } template @@ -1068,6 +1078,7 @@ struct gebp_kernel typedef typename Traits::RhsPacketx4 RhsPacketx4; typedef typename RhsPanelHelper::type RhsPanel15; + typedef typename RhsPanelHelper::type RhsPanel27; typedef gebp_traits SwappedTraits; @@ -1201,7 +1212,7 @@ struct lhs_process_one_packet traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); - #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC) __asm__ ("" : "+x,m" (*A0)); #endif EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); @@ -1213,13 +1224,140 @@ struct lhs_process_one_packet int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) { GEBPTraits traits; - + Index packet_cols8 = nr>=8 ? 
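// Editor's note on the RhsPanel27 typedef above: it feeds RhsPanelHelper with
// registers_taken = 27, since the new 3-packet-by-8-column kernel pins 24
// accumulators plus 3 lhs packets. A standalone model of the selection rule
// (names are ours; the clamped subtraction mirrors the diff's std::max):
//
//   #include <algorithm>
//
//   template <int TotalRegisters, int RegistersTaken>
//   struct rhs_panel_model {
//     static constexpr int remaining =
//         (std::max)(TotalRegisters - RegistersTaken, 0);
//     static constexpr bool use_quad_panel = remaining >= 4;  // RhsPacketx4 vs RhsPacket
//   };
//
//   // 32 NEON registers leave 32 - 27 = 5 free, enough for the quad rhs panel:
//   static_assert(rhs_panel_model<32, 27>::use_quad_panel, "RhsPacketx4 selected");
//   // 16 registers with 15 taken fall back to a single RhsPacket:
//   static_assert(!rhs_panel_model<16, 15>::use_quad_panel, "RhsPacket selected");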
(cols/8) * 8 : 0; // loops on each largest micro horizontal panel of lhs // (LhsProgress x depth) for(Index i=peelStart; i=8) { + for(Index j2=0; j2); \ + traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C4, T0, fix<0>); \ + traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C5, T0, fix<1>); \ + traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C6, T0, fix<2>); \ + traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C7, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \ + } while (false) + + EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8"); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*8*RhsProgress; + blA += pk*(1*LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8"); + } + // process remaining peeled loop + for(Index k=peeled_kc; k(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + + R0 = r4.template loadPacket(0); + R1 = r5.template loadPacket(0); + traits.acc(C4, alphav, R0); + traits.acc(C5, alphav, R1); + r4.storePacket(0, R0); + r5.storePacket(0, R1); + + R0 = r6.template loadPacket(0); + R1 = r7.template loadPacket(0); + traits.acc(C6, alphav, R0); + traits.acc(C7, alphav, R1); + r6.storePacket(0, R0); + r7.storePacket(0, R1); + } + } +#endif + // loops on each largest micro vertical panel of rhs (depth * nr) - for(Index j2=0; j2 cj; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? 
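// Editor's note: the 1pX8 micro kernel above, boiled down to scalars (a sketch
// assuming Packet = float, RhsProgress = 1, no conjugation). Eight accumulators
// stay live in registers across the whole depth loop, like C0..C7 above.
//
//   void gebp_micro_1x8(const float* blA, const float* blB, long depth,
//                       float alpha, float* res, long resStride) {
//     float C[8] = {0.0f};
//     for (long k = 0; k < depth; ++k) {
//       const float A0 = blA[k];          // traits.loadLhs
//       for (int j = 0; j < 8; ++j) {
//         C[j] += A0 * blB[8 * k + j];    // loadRhs/updateRhs + madd, lanes fix<0..3>
//       }
//     }
//     for (int j = 0; j < 8; ++j) {
//       res[j * resStride] += alpha * C[j];  // traits.acc: R += alpha * C
//     }
//   }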
peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; @@ -1441,7 +1580,220 @@ void gebp_kernel=8) { + for(Index j2=0; j2); \ + traits.madd(A1, rhs_panel, C8, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C16, T0, fix<0>); \ + traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C9, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C17, T0, fix<1>); \ + traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C10, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C18, T0, fix<2>); \ + traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C11, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C19, T0, fix<3>); \ + traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C12, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C20, T0, fix<0>); \ + traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C13, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C21, T0, fix<1>); \ + traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C14, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C22, T0, fix<2>); \ + traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C15, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C23, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \ + } while (false) + + EIGEN_GEBP_ONESTEP(0); + EIGEN_GEBP_ONESTEP(1); + EIGEN_GEBP_ONESTEP(2); + EIGEN_GEBP_ONESTEP(3); + EIGEN_GEBP_ONESTEP(4); + EIGEN_GEBP_ONESTEP(5); + EIGEN_GEBP_ONESTEP(6); + EIGEN_GEBP_ONESTEP(7); + + blB += pk * 8 * RhsProgress; + blA += pk * 3 * Traits::LhsProgress; + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8"); + } + + // process remaining peeled loop + for (Index k = peeled_kc; k < depth; k++) + { + + RhsPanel27 rhs_panel; + RhsPacket T0; + LhsPacket A2; + EIGEN_GEBP_ONESTEP(0); + blB += 8 * RhsProgress; + blA += 3 * Traits::LhsProgress; + } + + #undef EIGEN_GEBP_ONESTEP + + ResPacket R0, R1, R2; + ResPacket alphav = pset1(alpha); + + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C8, alphav, R1); + traits.acc(C16, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.template loadPacket(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C9, alphav, R1); + traits.acc(C17, alphav, R2); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.template loadPacket(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + 
traits.acc(C10, alphav, R1); + traits.acc(C18, alphav, R2); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.template loadPacket(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C11, alphav, R1); + traits.acc(C19, alphav, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r4.template loadPacket(0 * Traits::ResPacketSize); + R1 = r4.template loadPacket(1 * Traits::ResPacketSize); + R2 = r4.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C4, alphav, R0); + traits.acc(C12, alphav, R1); + traits.acc(C20, alphav, R2); + r4.storePacket(0 * Traits::ResPacketSize, R0); + r4.storePacket(1 * Traits::ResPacketSize, R1); + r4.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r5.template loadPacket(0 * Traits::ResPacketSize); + R1 = r5.template loadPacket(1 * Traits::ResPacketSize); + R2 = r5.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C5, alphav, R0); + traits.acc(C13, alphav, R1); + traits.acc(C21, alphav, R2); + r5.storePacket(0 * Traits::ResPacketSize, R0); + r5.storePacket(1 * Traits::ResPacketSize, R1); + r5.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r6.template loadPacket(0 * Traits::ResPacketSize); + R1 = r6.template loadPacket(1 * Traits::ResPacketSize); + R2 = r6.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C6, alphav, R0); + traits.acc(C14, alphav, R1); + traits.acc(C22, alphav, R2); + r6.storePacket(0 * Traits::ResPacketSize, R0); + r6.storePacket(1 * Traits::ResPacketSize, R1); + r6.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r7.template loadPacket(0 * Traits::ResPacketSize); + R1 = r7.template loadPacket(1 * Traits::ResPacketSize); + R2 = r7.template loadPacket(2 * Traits::ResPacketSize); + traits.acc(C7, alphav, R0); + traits.acc(C15, alphav, R1); + traits.acc(C23, alphav, R2); + r7.storePacket(0 * Traits::ResPacketSize, R0); + r7.storePacket(1 * Traits::ResPacketSize, R1); + r7.storePacket(2 * Traits::ResPacketSize, R2); + } + } + } +#endif + for(Index j2=packet_cols8; j2=8) { + for(Index j2=0; j2=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND + #endif +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C11, T0, fix<3>); \ + traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \ + 
traits.madd(A0, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C12, T0, fix<0>); \ + traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C13, T0, fix<1>); \ + traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C14, T0, fix<2>); \ + traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C15, T0, fix<3>); \ + EIGEN_GEBP_2Px8_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \ + } while (false) + + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8"); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*8*RhsProgress; + blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8"); + } + // process remaining peeled loop + for(Index k=peeled_kc; k(alpha); + + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C8, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C9, alphav, R3); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.template loadPacket(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C10, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C11, alphav, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r4.template loadPacket(0 * Traits::ResPacketSize); + R1 = r4.template loadPacket(1 * Traits::ResPacketSize); + R2 = r5.template loadPacket(0 * Traits::ResPacketSize); + R3 = r5.template loadPacket(1 * Traits::ResPacketSize); + traits.acc(C4, alphav, R0); + traits.acc(C12, alphav, R1); + traits.acc(C5, alphav, R2); + traits.acc(C13, alphav, R3); + r4.storePacket(0 * Traits::ResPacketSize, R0); + r4.storePacket(1 * Traits::ResPacketSize, R1); + r5.storePacket(0 * Traits::ResPacketSize, R2); + r5.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r6.template loadPacket(0 * Traits::ResPacketSize); + R1 = r6.template loadPacket(1 * Traits::ResPacketSize); + R2 = r7.template loadPacket(0 * Traits::ResPacketSize); + R3 = r7.template loadPacket(1 * Traits::ResPacketSize); + traits.acc(C6, alphav, R0); + traits.acc(C14, alphav, R1); + traits.acc(C7, alphav, R2); + traits.acc(C15, alphav, R3); + r6.storePacket(0 * Traits::ResPacketSize, R0); + r6.storePacket(1 * Traits::ResPacketSize, R1); + r7.storePacket(0 * Traits::ResPacketSize, R2); + r7.storePacket(1 * Traits::ResPacketSize, R3); + } + } + } +#endif + for(Index j2=packet_cols8; j2=6 without FMA (bug 1637) - #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #if EIGEN_GNUC_AT_LEAST(6,0) && 
defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC) #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND @@ -1904,22 +2422,84 @@ void gebp_kernel=8) { // loop on each panel of the rhs - for(Index j2=0; j2::half>::size; const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; - if ((SwappedTraits::LhsProgress % 4) == 0 && + // The following code assumes we can load SRhsPacket in such a way that + // it multiplies blocks of 4 elements in SLhsPacket. This is not the + // case for some customized kernels (i.e. NEON fp16). If the assumption + // fails, drop down to the scalar path. + constexpr bool kCanLoadSRhsQuad = (unpacket_traits::size < 4) || (unpacket_traits::size % (unpacket_traits::size / 4)) == 0; + if (kCanLoadSRhsQuad && + (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) @@ -1974,10 +2554,10 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; + typedef std::conditional_t=8,typename unpacket_traits::half,SResPacket> SResPacketHalf; + typedef std::conditional_t=8,typename unpacket_traits::half,SLhsPacket> SLhsPacketHalf; + typedef std::conditional_t=8,typename unpacket_traits::half,SRhsPacket> SRhsPacketHalf; + typedef std::conditional_t=8,typename unpacket_traits::half,SAccPacket> SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); SResPacketHalf alphav = pset1(alpha); @@ -2269,8 +2849,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs0) { Index remaining_rows = rows-i; @@ -2290,21 +2870,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } else if (HasHalf && psize == HalfPacketSize) { gone_half = true; PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); } else if (HasQuarter && psize == QuarterPacketSize) { gone_quarter = true; PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); } } count += psize*pack; @@ -2395,53 +2975,125 @@ 
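// Editor's aside on the EIGEN_GEBP_*_SPILLING_WORKAROUND macros above, now skipped
// for LCC: each expands to an empty asm that acts as an optimizer barrier, keeping
// GCC >= 6 from spilling the accumulators (bug 1637). Minimal standalone form (our
// demo, x86 GCC/Clang only; "+x,m" allows an SSE register or a memory operand):
//
//   inline float spilling_barrier_demo(float a) {
//     __asm__("" : "+x,m"(a));  // emits no instructions, only constrains the optimizer
//     return a;
//   }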
EIGEN_DONT_INLINE void gemm_pack_rhs=4 ? (cols/4) * 4 : 0; Index count = 0; const Index peeled_k = (depth/PacketSize)*PacketSize; -// if(nr>=8) -// { -// for(Index j2=0; j2 kernel; -// for (int p = 0; p < PacketSize; ++p) { -// kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); -// } -// ptranspose(kernel); -// for (int p = 0; p < PacketSize; ++p) { -// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); -// count+=PacketSize; -// } -// } -// } -// for(; k=4) +#if EIGEN_ARCH_ARM64 + EIGEN_IF_CONSTEXPR(nr>=8) + { + for(Index j2=0; j2 kernel0, kernel1, kernel2, kernel3; + kernel0.packet[0%PacketSize] = dm0.template loadPacket(k); + kernel0.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel1.packet[0%PacketSize] = dm2.template loadPacket(k); + kernel1.packet[1%PacketSize] = dm3.template loadPacket(k); + kernel2.packet[0%PacketSize] = dm4.template loadPacket(k); + kernel2.packet[1%PacketSize] = dm5.template loadPacket(k); + kernel3.packet[0%PacketSize] = dm6.template loadPacket(k); + kernel3.packet[1%PacketSize] = dm7.template loadPacket(k); + ptranspose(kernel0); + ptranspose(kernel1); + ptranspose(kernel2); + ptranspose(kernel3); + + pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize])); + pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize])); + pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize])); + pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize])); + + pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize])); + pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize])); + pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize])); + pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize])); + count+=8*PacketSize; + } + else if (PacketSize == 4) + { + PacketBlock kernel0, kernel1; + + kernel0.packet[0%PacketSize] = dm0.template loadPacket(k); + kernel0.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel0.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel0.packet[3%PacketSize] = dm3.template loadPacket(k); + kernel1.packet[0%PacketSize] = dm4.template loadPacket(k); + kernel1.packet[1%PacketSize] = dm5.template loadPacket(k); + kernel1.packet[2%PacketSize] = dm6.template loadPacket(k); + kernel1.packet[3%PacketSize] = dm7.template loadPacket(k); + ptranspose(kernel0); + ptranspose(kernel1); + + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel1.packet[0%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[1%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel1.packet[1%PacketSize])); + pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[2%PacketSize])); + pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel1.packet[2%PacketSize])); + pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[3%PacketSize])); + pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel1.packet[3%PacketSize])); + count+=8*PacketSize; + } + else if (PacketSize == 8) + { + PacketBlock kernel0; + + kernel0.packet[0%PacketSize] = dm0.template loadPacket(k); + kernel0.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel0.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel0.packet[3%PacketSize] = dm3.template loadPacket(k); + kernel0.packet[4%PacketSize] = dm4.template loadPacket(k); + kernel0.packet[5%PacketSize] = dm5.template loadPacket(k); + 
kernel0.packet[6%PacketSize] = dm6.template loadPacket(k); + kernel0.packet[7%PacketSize] = dm7.template loadPacket(k); + ptranspose(kernel0); + + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel0.packet[1%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[2%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel0.packet[3%PacketSize])); + pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[4%PacketSize])); + pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel0.packet[5%PacketSize])); + pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[6%PacketSize])); + pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel0.packet[7%PacketSize])); + count+=8*PacketSize; + } + } + } + + for(; k=4) { for(Index j2=packet_cols8; j2=4 ? (cols/4) * 4 : 0; Index count = 0; - // if(nr>=8) - // { - // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); - // pstoreu(blockB+count, cj.pconj(A)); - // } else if (PacketSize==4) { - // Packet A = ploadu(&rhs[k*rhsStride + j2]); - // Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); - // pstoreu(blockB+count, cj.pconj(A)); - // pstoreu(blockB+count+PacketSize, cj.pconj(B)); - // } else { - // const Scalar* b0 = &rhs[k*rhsStride + j2]; - // blockB[count+0] = cj(b0[0]); - // blockB[count+1] = cj(b0[1]); - // blockB[count+2] = cj(b0[2]); - // blockB[count+3] = cj(b0[3]); - // blockB[count+4] = cj(b0[4]); - // blockB[count+5] = cj(b0[5]); - // blockB[count+6] = cj(b0[6]); - // blockB[count+7] = cj(b0[7]); - // } - // count += 8; - // } - // // skip what we have after - // if(PanelMode) count += 8 * (stride-offset-depth); - // } - // } +#if EIGEN_ARCH_ARM64 + EIGEN_IF_CONSTEXPR(nr>=8) + { + for(Index j2=0; j2(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (PacketSize==4) { + Packet A = rhs.template loadPacket(k, j2); + Packet B = rhs.template loadPacket(k, j2 + 4); + pstoreu(blockB+count, cj.pconj(A)); + pstoreu(blockB+count+PacketSize, cj.pconj(B)); + count += 2*PacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + blockB[count+4] = cj(dm0(4)); + blockB[count+5] = cj(dm0(5)); + blockB[count+6] = cj(dm0(6)); + blockB[count+7] = cj(dm0(7)); + count += 8; + } + } + // skip what we have after + if(PanelMode) count += 8 * (stride-offset-depth); + } + } +#endif + if(nr>=4) { for(Index j2=packet_cols8; j2 class level3_blocking; +template class level3_blocking; /* Specialization for a row-major destination matrix => simple transposition of the product */ template< @@ -148,9 +150,6 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i class gemm_blocking_space; -template +template class level3_blocking { - typedef _LhsScalar LhsScalar; - typedef _RhsScalar RhsScalar; + typedef LhsScalar_ LhsScalar; + typedef RhsScalar_ RhsScalar; protected: LhsScalar* m_blockA; @@ -275,20 +274,19 @@ class level3_blocking inline RhsScalar* blockB() { return m_blockB; } }; -template -class gemm_blocking_space +template +class gemm_blocking_space : public level3_blocking< - typename conditional::type, - typename conditional::type> + std::conditional_t, + std::conditional_t> { enum { Transpose = StorageOrder==RowMajor, ActualRows = Transpose ? 
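// Editor's note: both nr==8 packing branches above -- the ptranspose kernels and
// the direct packet loads -- produce the same blockB layout. A scalar sketch of
// that layout (our code; column-major rhs with leading dimension rhsStride,
// conjugation via cj() omitted):
//
//   void pack_rhs_panel_8(const float* rhs, long rhsStride, long depth,
//                         long j2, float* blockB) {
//     long count = 0;
//     for (long k = 0; k < depth; ++k) {
//       for (int j = 0; j < 8; ++j) {
//         // the 8 coefficients of row k, columns j2..j2+7, become contiguous,
//         // which is what the 8-column micro kernels stream linearly
//         blockB[count++] = rhs[(j2 + j) * rhsStride + k];
//       }
//     }
//   }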
MaxCols : MaxRows, ActualCols = Transpose ? MaxRows : MaxCols }; - typedef typename conditional::type LhsScalar; - typedef typename conditional::type RhsScalar; - typedef gebp_traits Traits; + typedef std::conditional_t LhsScalar; + typedef std::conditional_t RhsScalar; enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth @@ -326,18 +324,17 @@ class gemm_blocking_space -class gemm_blocking_space +template +class gemm_blocking_space : public level3_blocking< - typename conditional::type, - typename conditional::type> + std::conditional_t, + std::conditional_t> { enum { Transpose = StorageOrder==RowMajor }; - typedef typename conditional::type LhsScalar; - typedef typename conditional::type RhsScalar; - typedef gebp_traits Traits; + typedef std::conditional_t LhsScalar; + typedef std::conditional_t RhsScalar; Index m_sizeA; Index m_sizeB; @@ -416,14 +413,14 @@ struct generic_product_impl typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; + typedef internal::remove_all_t ActualLhsTypeCleaned; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; + typedef internal::remove_all_t ActualRhsTypeCleaned; enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + MaxDepthAtCompileTime = min_size_prefer_fixed(Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) }; typedef generic_product_impl lazyproduct; @@ -486,8 +483,8 @@ struct generic_product_impl ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); } - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + add_const_on_value_type_t lhs = LhsBlasTraits::extract(a_lhs); + add_const_on_value_type_t rhs = RhsBlasTraits::extract(a_rhs); Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); diff --git a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9b..716f2ca 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -10,6 +10,8 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H #define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H +#include "../InternalHeaderCheck.h" + namespace Eigen { template @@ -142,7 +144,7 @@ struct tribb_kernel typedef typename Traits::ResScalar ResScalar; enum { - BlockSize = meta_least_common_multiple::ret + BlockSize = meta_least_common_multiple::ret }; void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { @@ -208,17 +210,17 @@ struct general_product_to_triangular_selector { typedef typename MatrixType::Scalar Scalar; - typedef typename internal::remove_all::type Lhs; + typedef internal::remove_all_t Lhs; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; - typedef typename internal::remove_all::type _ActualLhs; - typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + typedef internal::remove_all_t ActualLhs_; + internal::add_const_on_value_type_t actualLhs = 
LhsBlasTraits::extract(prod.lhs()); - typedef typename internal::remove_all::type Rhs; + typedef internal::remove_all_t Rhs; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; - typedef typename internal::remove_all::type _ActualRhs; - typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + typedef internal::remove_all_t ActualRhs_; + internal::add_const_on_value_type_t actualRhs = RhsBlasTraits::extract(prod.rhs()); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); @@ -227,19 +229,19 @@ struct general_product_to_triangular_selector enum { StorageOrder = (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, - UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1, - UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1 + UseLhsDirectly = ActualLhs_::InnerStrideAtCompileTime==1, + UseRhsDirectly = ActualRhs_::InnerStrideAtCompileTime==1 }; internal::gemv_static_vector_if static_lhs; ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(), (UseLhsDirectly ? const_cast(actualLhs.data()) : static_lhs.data())); - if(!UseLhsDirectly) Map(actualLhsPtr, actualLhs.size()) = actualLhs; + if(!UseLhsDirectly) Map(actualLhsPtr, actualLhs.size()) = actualLhs; internal::gemv_static_vector_if static_rhs; ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(), (UseRhsDirectly ? const_cast(actualRhs.data()) : static_rhs.data())); - if(!UseRhsDirectly) Map(actualRhsPtr, actualRhs.size()) = actualRhs; + if(!UseRhsDirectly) Map(actualRhsPtr, actualRhs.size()) = actualRhs; selfadjoint_rank1_update { static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) { - typedef typename internal::remove_all::type Lhs; + typedef internal::remove_all_t Lhs; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; - typedef typename internal::remove_all::type _ActualLhs; - typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + typedef internal::remove_all_t ActualLhs_; + internal::add_const_on_value_type_t actualLhs = LhsBlasTraits::extract(prod.lhs()); - typedef typename internal::remove_all::type Rhs; + typedef internal::remove_all_t Rhs; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; - typedef typename internal::remove_all::type _ActualRhs; - typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + typedef internal::remove_all_t ActualRhs_; + internal::add_const_on_value_type_t actualRhs = RhsBlasTraits::extract(prod.rhs()); typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); @@ -273,8 +275,8 @@ struct general_product_to_triangular_selector enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, - LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, - RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + LhsIsRowMajor = ActualLhs_::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = ActualRhs_::Flags&RowMajorBit ? 
1 : 0, SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 }; @@ -284,7 +286,7 @@ struct general_product_to_triangular_selector Index depth = actualLhs.cols(); typedef internal::gemm_blocking_space BlockingType; + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, ActualRhs_::MaxColsAtCompileTime> BlockingType; BlockingType blocking(size, size, depth, 1, false); diff --git a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 9a650ec..45ad5da 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H #define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h index 71abf40..490fe67 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H #define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aeb..7307994 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,6 +10,8 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -29,42 +31,42 @@ struct gemv_packet_cond { typedef T1 type; }; template struct gemv_packet_cond { typedef T2 type; }; -template +template class gemv_traits { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; -#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename gemv_packet_cond::type, \ typename packet_traits::half, \ typename unpacket_traits::half>::half>::type \ - prefix ## name ## Packet + name ## Packet ## postfix - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); -#undef PACKET_DECL_COND_PREFIX + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); +#undef PACKET_DECL_COND_POSTFIX public: enum { - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && - unpacket_traits<_RhsPacket>::vectorizable && - int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1 + Vectorizable = unpacket_traits::vectorizable && + unpacket_traits::vectorizable && + int(unpacket_traits::size)==int(unpacket_traits::size), + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<ResPacket_>::size : 1 }; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; - typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; - typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket; + typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket; + typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket; }; /* Optimized col-major matrix * vector product: * This algorithm processes the matrix per vertical panels, - * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. + * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments. * * Mixing type logic: C += alpha * A * B * | A | B |alpha| comments diff --git a/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index 6e36c2b..f77e2e4 100644 --- a/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H #define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/Parallelizer.h b/libs/eigen/Eigen/src/Core/products/Parallelizer.h index 8f91879..da4affb 100644 --- a/libs/eigen/Eigen/src/Core/products/Parallelizer.h +++ b/libs/eigen/Eigen/src/Core/products/Parallelizer.h @@ -10,9 +10,7 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H -#if EIGEN_HAS_CXX11_ATOMIC -#include <atomic> -#endif +#include "../InternalHeaderCheck.h" namespace Eigen { @@ -78,18 +76,13 @@ namespace internal { template<typename Index> struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} - // volatile is not enough on all architectures (see bug 1572) - // to guarantee that when thread A says to thread B that it is - // done with packing a block, then all writes have been really - // carried out... C++11 memory model+atomic guarantees this. -#if EIGEN_HAS_CXX11_ATOMIC +#ifdef EIGEN_HAS_OPENMP + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} std::atomic<Index> sync; std::atomic<int> users; #else - Index volatile sync; - int volatile users; + GemmParallelInfo() : lhs_start(0), lhs_length(0) {} #endif Index lhs_start; @@ -104,7 +97,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, // Without C++11, we have to disable GEMM's parallelization on // non x86 architectures because there volatile is not enough for our purpose. // See bug 1572. -#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) +#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices.
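The GemmParallelInfo hunk above drops the pre-C++11 fallback: as the removed comment explains, volatile cannot guarantee that a thread announcing "block packed" has also made the block's contents visible to other threads, whereas the C++11 memory model with std::atomic does. A minimal sketch of that handshake, with hypothetical names rather than Eigen's actual GEMM loop:

#include <atomic>
#include <vector>

// Hypothetical model of the publish/consume protocol behind the atomic
// members of GemmParallelInfo (illustration only).
struct PanelInfo {
  std::atomic<int> sync{-1};   // index of the last fully packed block
  std::vector<double> blockA;  // packed block shared between threads
};

// Packing thread: write the block first, then publish its index.
void pack_block(PanelInfo& info, int k) {
  info.blockA.assign(1024, double(k)); // all stores to blockA happen before...
  info.sync.store(k);                  // ...this store (seq_cst implies release)
}

// Consumer thread: spin until block k is published, then read it safely.
double consume_block(PanelInfo& info, int k) {
  while (info.sync.load() != k) {}     // the matching load implies acquire
  return info.blockA[0];               // guaranteed to observe the packed data
}

With volatile in place of std::atomic, the compiler and CPU would be free to reorder the blockA writes past the flag update on weakly ordered architectures, which is exactly the bug 1572 scenario cited in the removed comment.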
This whole diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 33ecf10..c7bb445 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -312,10 +314,10 @@ struct product_selfadjoint_matrix& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), - EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, - LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), + logical_xor(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + RhsSelfAdjoint, NumTraits::IsComplex && logical_xor(RhsSelfAdjoint, ConjugateRhs), + logical_xor(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + LhsSelfAdjoint, NumTraits::IsComplex && logical_xor(LhsSelfAdjoint, ConjugateLhs), ColMajor,ResInnerStride> ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } @@ -509,8 +511,8 @@ struct selfadjoint_product_impl { eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + add_const_on_value_type_t lhs = LhsBlasTraits::extract(a_lhs); + add_const_on_value_type_t rhs = RhsBlasTraits::extract(a_rhs); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); @@ -521,10 +523,10 @@ struct selfadjoint_product_impl BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, - NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), - EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, - NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), + internal::logical_xor(LhsIsUpper, internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits::IsComplex && internal::logical_xor(LhsIsUpper, bool(LhsBlasTraits::NeedToConjugate)), + internal::logical_xor(RhsIsUpper, internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits::IsComplex && internal::logical_xor(RhsIsUpper, bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? 
RowMajor : ColMajor, Dest::InnerStrideAtCompileTime> ::run( diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index 61396db..0e371da 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H #define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h index d38fd72..a62b6b5 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H #define EIGEN_SELFADJOINT_MATRIX_VECTOR_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -55,12 +57,12 @@ void selfadjoint_matrix_vector_product::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; - conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + conj_helper::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + conj_helper::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; conj_helper cjd; - conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; - conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; + conj_helper::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + conj_helper::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? 
numext::conj(alpha) : alpha; @@ -167,11 +169,11 @@ struct selfadjoint_product_impl typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; + typedef internal::remove_all_t ActualLhsTypeCleaned; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; + typedef internal::remove_all_t ActualRhsTypeCleaned; enum { LhsUpLo = LhsMode&(Upper|Lower) }; @@ -181,12 +183,12 @@ struct selfadjoint_product_impl { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; - typedef Map, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits::size)> MappedDest; + typedef Map, plain_enum_min(AlignedMax,internal::packet_traits::size)> MappedDest; eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols()); - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + add_const_on_value_type_t lhs = LhsBlasTraits::extract(a_lhs); + add_const_on_value_type_t rhs = RhsBlasTraits::extract(a_rhs); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 1238345..99a8ccd 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H #define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/libs/eigen/Eigen/src/Core/products/SelfadjointProduct.h index a21be80..4cbc1f7 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointProduct.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointProduct.h @@ -16,6 +16,8 @@ * It corresponds to the level 3 SYRK and level 2 SYR Blas routines. **********************************************************************/ +#include "../InternalHeaderCheck.h" + namespace Eigen { @@ -26,7 +28,7 @@ struct selfadjoint_rank1_update { internal::conj_if cj; typedef Map > OtherMap; - typedef typename internal::conditional::type ConjLhsType; + typedef std::conditional_t ConjLhsType; for (Index i=0; i >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1))) @@ -55,14 +57,14 @@ struct selfadjoint_product_selector typedef typename MatrixType::Scalar Scalar; typedef internal::blas_traits OtherBlasTraits; typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; - typedef typename internal::remove_all::type _ActualOtherType; - typename internal::add_const_on_value_type::type actualOther = OtherBlasTraits::extract(other.derived()); + typedef internal::remove_all_t ActualOtherType_; + internal::add_const_on_value_type_t actualOther = OtherBlasTraits::extract(other.derived()); Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); enum { StorageOrder = (internal::traits::Flags&RowMajorBit) ? 
RowMajor : ColMajor, - UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1 + UseOtherDirectly = ActualOtherType_::InnerStrideAtCompileTime==1 }; internal::gemv_static_vector_if static_other; @@ -70,7 +72,7 @@ struct selfadjoint_product_selector (UseOtherDirectly ? const_cast(actualOther.data()) : static_other.data())); if(!UseOtherDirectly) - Map(actualOtherPtr, actualOther.size()) = actualOther; + Map(actualOtherPtr, actualOther.size()) = actualOther; selfadjoint_rank1_update::IsComplex, @@ -87,21 +89,21 @@ struct selfadjoint_product_selector typedef typename MatrixType::Scalar Scalar; typedef internal::blas_traits OtherBlasTraits; typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; - typedef typename internal::remove_all::type _ActualOtherType; - typename internal::add_const_on_value_type::type actualOther = OtherBlasTraits::extract(other.derived()); + typedef internal::remove_all_t ActualOtherType_; + internal::add_const_on_value_type_t actualOther = OtherBlasTraits::extract(other.derived()); Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, - OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0 + OtherIsRowMajor = ActualOtherType_::Flags&RowMajorBit ? 1 : 0 }; Index size = mat.cols(); Index depth = actualOther.cols(); typedef internal::gemm_blocking_space BlockingType; + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, ActualOtherType_::MaxColsAtCompileTime> BlockingType; BlockingType blocking(size, size, depth, 1, false); diff --git a/libs/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/libs/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h index f752a0b..fb199ad 100644 --- a/libs/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/libs/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SELFADJOINTRANK2UPTADE_H #define EIGEN_SELFADJOINTRANK2UPTADE_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -50,9 +52,8 @@ struct selfadjoint_rank2_update_selector } }; -template struct conj_expr_if - : conditional::Scalar>,T> > {}; +template +using conj_expr_if = std::conditional::Scalar>,T>>; } // end namespace internal @@ -63,13 +64,13 @@ EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView UBlasTraits; typedef typename UBlasTraits::DirectLinearAccessType ActualUType; - typedef typename internal::remove_all::type _ActualUType; - typename internal::add_const_on_value_type::type actualU = UBlasTraits::extract(u.derived()); + typedef internal::remove_all_t ActualUType_; + internal::add_const_on_value_type_t actualU = UBlasTraits::extract(u.derived()); typedef internal::blas_traits VBlasTraits; typedef typename VBlasTraits::DirectLinearAccessType ActualVType; - typedef typename internal::remove_all::type _ActualVType; - typename internal::add_const_on_value_type::type actualV = VBlasTraits::extract(v.derived()); + typedef internal::remove_all_t ActualVType_; + internal::add_const_on_value_type_t actualV = VBlasTraits::extract(v.derived()); // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and // vice versa, and take the complex conjugate of all coefficients and vector entries. 
@@ -80,8 +81,8 @@ EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef internal::remove_all_t::type> UType; + typedef internal::remove_all_t::type> VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h index f0c6050..770107a 100644 --- a/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H #define EIGEN_TRIANGULAR_MATRIX_MATRIX_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -18,10 +20,10 @@ namespace internal { // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; -// const_blas_data_mapper lhs(_lhs,lhsStride); +// const_blas_data_mapper lhs(lhs_,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i Traits; enum { - SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + SmallPanelWidth = 2 * plain_enum_max(Traits::mr, Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; @@ -110,9 +112,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -124,9 +126,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -247,15 +249,15 @@ struct product_triangular_matrix_matrix Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 
0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; @@ -268,9 +270,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -283,9 +285,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -412,13 +414,13 @@ struct triangular_product_impl typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; + typedef internal::remove_all_t ActualLhsTypeCleaned; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; - - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + typedef internal::remove_all_t ActualRhsTypeCleaned; + + internal::add_const_on_value_type_t lhs = LhsBlasTraits::extract(a_lhs); + internal::add_const_on_value_type_t rhs = RhsBlasTraits::extract(a_rhs); LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); @@ -451,12 +453,12 @@ struct triangular_product_impl // Apply correction if the diagonal is unit and a scalar factor was nested: if ((Mode&UnitDiag)==UnitDiag) { - if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + if (LhsIsTriangular && !numext::is_exactly_one(lhs_alpha)) { Index diagSize = (std::min)(lhs.rows(),lhs.cols()); dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); } - else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + else if ((!LhsIsTriangular) && !numext::is_exactly_one(rhs_alpha)) { Index diagSize = (std::min)(rhs.rows(),rhs.cols()); dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); diff --git a/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index a98d12e..1eb57d3 100644 --- a/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H #define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git 
a/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector.h b/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector.h index 76bfa15..df15e81 100644 --- a/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H #define EIGEN_TRIANGULARMATRIXVECTOR_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -209,17 +211,16 @@ template struct trmv_selector typedef typename Lhs::Scalar LhsScalar; typedef typename Rhs::Scalar RhsScalar; typedef typename Dest::Scalar ResScalar; - typedef typename Dest::RealScalar RealScalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef Map, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits::size)> MappedDest; + typedef Map, plain_enum_min(AlignedMax,internal::packet_traits::size)> MappedDest; - typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); - typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(rhs); + add_const_on_value_type_t actualLhs = LhsBlasTraits::extract(lhs); + add_const_on_value_type_t actualRhs = RhsBlasTraits::extract(rhs); LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); @@ -235,7 +236,7 @@ template struct trmv_selector gemv_static_vector_if static_dest; - bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); + bool alphaIsCompatible = (!ComplexByReal) || numext::is_exactly_zero(numext::imag(actualAlpha)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; RhsScalar compatibleAlpha = get_factor::run(actualAlpha); @@ -276,7 +277,7 @@ template struct trmv_selector dest = MappedDest(actualDestPtr, dest.size()); } - if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + if ( ((Mode&UnitDiag)==UnitDiag) && !numext::is_exactly_one(lhs_alpha) ) { Index diagSize = (std::min)(lhs.rows(),lhs.cols()); dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); @@ -297,10 +298,10 @@ template struct trmv_selector typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; + typedef internal::remove_all_t ActualRhsTypeCleaned; - typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); - typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); + std::add_const_t actualLhs = LhsBlasTraits::extract(lhs); + std::add_const_t actualRhs = RhsBlasTraits::extract(rhs); LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); @@ -335,7 +336,7 @@ template struct trmv_selector dest.data(),dest.innerStride(), actualAlpha); - if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + if ( ((Mode&UnitDiag)==UnitDiag) && !numext::is_exactly_one(lhs_alpha) ) { Index diagSize = (std::min)(lhs.rows(),lhs.cols()); dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); diff --git a/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 
3d47a2b..7a4d59e 100644 --- a/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/libs/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -33,6 +33,8 @@ #ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H #define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/libs/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h index 6d879ba..b148d9c 100644 --- a/libs/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/libs/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2009 Gael Guennebaud +// Modifications Copyright (C) 2022 Intel Corporation // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,10 +11,123 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H -namespace Eigen { +#include "../InternalHeaderCheck.h" + +namespace Eigen { namespace internal { +template +struct trsmKernelL { + // Generic Implementation of triangular solve for triangular matrix on left and multiple rhs. + // Handles non-packed matrices. + static void kernel( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride); +}; + +template +struct trsmKernelR { + // Generic Implementation of triangular solve for triangular matrix on right and multiple lhs. + // Handles non-packed matrices. + static void kernel( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride); +}; + +template +EIGEN_STRONG_INLINE void trsmKernelL::kernel( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride) + { + typedef const_blas_data_mapper TriMapper; + typedef blas_data_mapper OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride, otherIncr); + + enum { IsLower = (Mode&Lower) == Lower }; + conj_if conj; + + // tr solve + for (Index k=0; k +EIGEN_STRONG_INLINE void trsmKernelR::kernel( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride) +{ + typedef typename NumTraits::Real RealScalar; + typedef blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + LhsMapper lhs(_other, otherStride, otherIncr); + RhsMapper rhs(_tri, triStride); + + enum { + RhsStorageOrder = TriStorageOrder, + IsLower = (Mode&Lower) == Lower + }; + conj_if conj; + + for (Index k=0; k struct triangular_solve_matrix @@ -44,6 +158,7 @@ struct triangular_solve_matrix& blocking); }; + template EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, @@ -53,6 +168,25 @@ EIGEN_DONT_INLINE void triangular_solve_matrix::value || + std::is_same::value)) ) { + // Very rough cutoffs to determine when to call trsm w/o packing + // For small problem sizes trsmKernel compiled with clang is generally faster. + // TODO: Investigate better heuristics for cutoffs. 
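The trsmKernelL / trsmKernelR fallbacks introduced above are plain substitution loops over unpacked data. Stripped of Eigen's blas_data_mapper plumbing and the Mode/Conjugate variants, the left-sided case is ordinary forward substitution; a self-contained sketch for a column-major lower triangle with non-unit diagonal (illustrative only, not the exact Eigen kernel):

#include <cstddef>

// Solve L * X = B in place: tri is a column-major lower-triangular matrix,
// other holds nrhs right-hand-side columns, overwritten with the solution.
void trsm_left_lower(const double* tri, std::ptrdiff_t triStride,
                     double* other, std::ptrdiff_t otherStride,
                     std::ptrdiff_t size, std::ptrdiff_t nrhs) {
  for (std::ptrdiff_t j = 0; j < nrhs; ++j) {
    double* b = other + j * otherStride;      // j-th right-hand side
    for (std::ptrdiff_t k = 0; k < size; ++k) {
      b[k] /= tri[k + k * triStride];         // divide by the pivot
      const double bk = b[k];
      for (std::ptrdiff_t i = k + 1; i < size; ++i)
        b[i] -= tri[i + k * triStride] * bk;  // eliminate below the pivot
    }
  }
}

The right-sided kernel is the same idea with the roles of the triangular and rectangular operands exchanged. The patch continues below by dispatching to these kernels without packing when the problem is small.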
+ double L2Cap = 0.5; // 50% of L2 size + if (size < avx512_trsm_cutoff(l2, cols, L2Cap)) { + trsmKernelL::kernel( + size, cols, _tri, triStride, _other, 1, otherStride); + return; + } + } +#endif + typedef const_blas_data_mapper TriMapper; typedef blas_data_mapper OtherMapper; TriMapper tri(_tri, triStride); @@ -61,7 +195,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -74,15 +208,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache // coherence when accessing the rhs elements - std::ptrdiff_t l1, l2, l3; - manage_caching_sizes(GetAction, &l1, &l2, &l3); Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max(otherStride,size)) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); @@ -113,38 +244,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix(actual_kc-k1, SmallPanelWidth); // tr solve - for (Index k=0; k::value || + std::is_same::value)) ) { + i = IsLower ? k2 + k1: k2 - k1 - actualPanelWidth; } +#endif + trsmKernelL::kernel( + actualPanelWidth, actual_cols, + _tri + i + (i)*triStride, triStride, + _other + i*OtherInnerStride + j2*otherStride, otherIncr, otherStride); } Index lengthTarget = actual_kc-k1-actualPanelWidth; @@ -166,7 +278,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix GEPP { Index start = IsLower ? k2+kc : 0; @@ -196,6 +308,7 @@ struct triangular_solve_matrix& blocking); }; + template EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, @@ -204,7 +317,22 @@ EIGEN_DONT_INLINE void triangular_solve_matrix& blocking) { Index rows = otherSize; - typedef typename NumTraits::Real RealScalar; + +#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_R_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS + EIGEN_IF_CONSTEXPR( (OtherInnerStride == 1 && + (std::is_same::value || + std::is_same::value)) ) { + // TODO: Investigate better heuristics for cutoffs. 
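Both cutoff blocks gate the no-copy path on avx512_trsm_cutoff(l2, ., L2Cap). Its exact formula lives in Eigen's AVX512 TRSM kernels; the general shape of such an L2-budget heuristic, sketched under that assumption and not Eigen's actual implementation, is:

#include <cmath>
#include <cstddef>

// Sketch of an L2-based cutoff: return the largest triangle size for which
// the triangle (size*size/2 entries) plus the panel of the other matrix
// (size*cols entries) fits into l2Cap * L2. Hypothetical formula.
template <typename Scalar>
std::ptrdiff_t trsm_cutoff_sketch(std::ptrdiff_t l2Bytes, std::ptrdiff_t cols, double l2Cap) {
  const double budget = l2Cap * double(l2Bytes) / double(sizeof(Scalar));
  const double c = double(cols);
  // positive root of size^2/2 + cols*size - budget = 0
  return static_cast<std::ptrdiff_t>(std::sqrt(c * c + 2.0 * budget) - c);
}

For example, with a 1 MiB L2, double precision, and 64 columns this evaluates to a cutoff of roughly 300, consistent with the intent of the comments above: only small triangles skip the packing machinery.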
+ std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + double L2Cap = 0.5; // 50% of L2 size + if (size < avx512_trsm_cutoff(l2, rows, L2Cap)) { + trsmKernelR:: + kernel(size, rows, _tri, triStride, _other, 1, otherStride); + return; + } + } +#endif typedef blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; @@ -214,7 +342,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix Traits; enum { RhsStorageOrder = TriStorageOrder, - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -227,7 +355,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; @@ -294,27 +421,13 @@ EIGEN_DONT_INLINE void triangular_solve_matrix:: + kernel(actualPanelWidth, actual_mc, + _tri + absolute_j2 + absolute_j2*triStride, triStride, + _other + i2*OtherInnerStride + absolute_j2*otherStride, otherIncr, otherStride); } - // pack the just computed part of lhs to A pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2), actualPanelWidth, actual_mc, @@ -329,7 +442,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typename internal::conditional< - Conjugate, - const CwiseUnaryOp,LhsMap>, - const LhsMap&> - ::type cjLhs(lhs); + std::conditional_t< + Conjugate, + const CwiseUnaryOp,LhsMap>, + const LhsMap&> cjLhs(lhs); static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; for(Index pi=IsLower ? 0 : size; IsLower ? pi0; @@ -77,7 +78,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) + if((!(Mode & UnitDiag)) && !is_identically_zero(rhs[i])) rhs[i] /= cjLhs(i,i); } } @@ -97,10 +98,10 @@ struct triangular_solve_vector(lhsStride)); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typename internal::conditional,LhsMap>, - const LhsMap& - >::type cjLhs(lhs); + std::conditional_t,LhsMap>, + const LhsMap& + > cjLhs(lhs); static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; for(Index pi=IsLower ? 
0 : size; @@ -114,7 +115,7 @@ struct triangular_solve_vector(m_data + i); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const { + return ploadt_partial(m_data + i, n, offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const { + return ploadt(m_data + i); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { pstoret(m_data + i, p); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index offset = 0) const { + pstoret_partial(m_data + i, p, n, offset); + } + protected: Scalar *m_data; }; @@ -187,6 +204,9 @@ public: return VectorMapper(&operator()(i, j)); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { + internal::prefetch(&operator()(i, j)); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { @@ -198,11 +218,26 @@ public: return ploadt(&operator()(i, j)); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index offset = 0) const { + return ploadt_partial(&operator()(i, j), n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { return ploadt(&operator()(i, j)); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType &p) const { + pstoret(&operator()(i, j), p); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index offset = 0) const { + pstoret_partial(&operator()(i, j), p, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); @@ -214,6 +249,7 @@ public: } EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Index incr() const { return 1; } EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { @@ -255,11 +291,21 @@ public: return pgather(m_data + i*m_incr.value(), m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/ = 0) const { + return pgather_partial(m_data + i*m_incr.value(), m_incr.value(), n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { pscatter(m_data + i*m_incr.value(), p, m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType &p, Index n, Index /*offset*/ = 0) const { + pscatter_partial(m_data + i*m_incr.value(), p, m_incr.value(), n); + } + protected: Scalar *m_data; const internal::variable_if_dynamic m_incr; @@ -282,6 +328,10 @@ public: return LinearMapper(&operator()(i, j), m_incr.value()); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { + internal::prefetch(&operator()(i, j)); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { return m_data[StorageOrder==RowMajor ? 
j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; @@ -292,11 +342,26 @@ public: return pgather(&operator()(i, j),m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n, Index /*offset*/ = 0) const { + return pgather_partial(&operator()(i, j),m_incr.value(),n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { return pgather(&operator()(i, j),m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType &p) const { + pscatter(&operator()(i, j), p, m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType &p, Index n, Index /*offset*/ = 0) const { + pscatter_partial(&operator()(i, j), p, m_incr.value(), n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); @@ -308,17 +373,18 @@ public: } // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types. - template + template struct storePacketBlock_helper { - storePacketBlock_helper spbh; + storePacketBlock_helper spbh; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { spbh.store(sup, i,j,block); - for(int l = 0; l < unpacket_traits::size; l++) - { - ScalarT *v = &sup->operator()(i+l, j+idx); - *v = block.packet[idx][l]; - } + sup->template storePacket(i, j+idx, block.packet[idx]); + //for(int l = 0; l < unpacket_traits::size; l++) + //{ + // Scalar_ *v = &sup->operator()(i+l, j+idx); + // *v = *reinterpret_cast(&block.packet[idx][l]); + //} } }; @@ -328,12 +394,7 @@ public: storePacketBlock_helper, n, idx-1> spbh; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { spbh.store(sup,i,j,block); - for(int l = 0; l < unpacket_traits::size; l++) - { - std::complex *v = &sup->operator()(i+l, j+idx); - v->real(block.packet[idx].v[2*l+0]); - v->imag(block.packet[idx].v[2*l+1]); - } + sup->template storePacket(i, j+idx, block.packet[idx]); } }; @@ -352,8 +413,8 @@ public: } }; - template - struct storePacketBlock_helper + template + struct storePacketBlock_helper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { } @@ -378,6 +439,10 @@ public: storePacketBlock_helper spb; spb.store(this, i,j,block); } + + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Index incr() const { return m_incr.value(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; @@ -403,7 +468,7 @@ template struct blas_traits { typedef typename traits::Scalar Scalar; typedef const XprType& ExtractType; - typedef XprType _ExtractType; + typedef XprType ExtractType_; enum { IsComplex = NumTraits::IsComplex, IsTransposed = false, @@ -414,10 +479,10 @@ template struct blas_traits ) ? 
1 : 0, HasScalarFactor = false }; - typedef typename conditional::type DirectLinearAccessType; + typename ExtractType_::PlainObject + > DirectLinearAccessType; static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; @@ -498,12 +563,12 @@ struct blas_traits > typedef typename NestedXpr::Scalar Scalar; typedef blas_traits Base; typedef Transpose XprType; - typedef Transpose ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS - typedef Transpose _ExtractType; - typedef typename conditional ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS + typedef Transpose ExtractType_; + typedef std::conditional_t::type DirectLinearAccessType; + > DirectLinearAccessType; enum { IsTransposed = Base::IsTransposed ? 0 : 1 }; diff --git a/libs/eigen/Eigen/src/Core/util/ConfigureVectorization.h b/libs/eigen/Eigen/src/Core/util/ConfigureVectorization.h index af4e696..7c1a08b 100644 --- a/libs/eigen/Eigen/src/Core/util/ConfigureVectorization.h +++ b/libs/eigen/Eigen/src/Core/util/ConfigureVectorization.h @@ -30,27 +30,13 @@ * * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. - * - * FIXME: this code can be cleaned up once we switch to proper C++11 only. */ #if (defined EIGEN_CUDACC) #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_HAS_ALIGNAS +#else #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) #define EIGEN_ALIGNOF(x) alignof(x) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) - #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) - #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) - #define EIGEN_ALIGNOF(x) __alignof(x) -#else - #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler #endif // If the user explicitly disable vectorization, then we also disable alignment @@ -105,18 +91,12 @@ // try to keep heap alignment even when we have to disable static alignment. #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) - // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. - // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. - // 4.8 and newer seem definitely unaffected. - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 #endif // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ - && !EIGEN_GCC3_OR_OLDER \ && !EIGEN_COMP_SUNCC \ && !EIGEN_OS_QNX #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 @@ -201,14 +181,12 @@ // removed as gcc 4.1 and msvc 2008 are not supported anyways. 
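The ConfigureVectorization.h hunk above retires the per-compiler alignment attributes (__attribute__((aligned(n))), __declspec(align(n)), and the SunCC special case) in favor of the standard alignas/alignof keywords, keeping only the CUDA __align__ branch. The resulting pattern, shown here with hypothetical MY_ names:

// Standard C++11/14 replacement for the old compiler-specific macros.
#define MY_ALIGN_TO_BOUNDARY(n) alignas(n)
#define MY_ALIGNOF(x) alignof(x)

// e.g. a 32-byte aligned buffer suitable for aligned AVX loads:
struct MY_ALIGN_TO_BOUNDARY(32) PacketBuffer {
  float data[8];
};
static_assert(MY_ALIGNOF(PacketBuffer) == 32, "alignment applied");

Since the library now requires C++14 throughout, the #error fallback for compilers with no known alignment attribute can go as well, which is what the deleted branch did.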
#if EIGEN_COMP_MSVC #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. - #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif + // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER #endif #else - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_COMP_GNUC ) #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC #endif #endif @@ -292,6 +270,17 @@ #ifdef __AVX512BF16__ #define EIGEN_VECTORIZE_AVX512BF16 #endif + #ifdef __AVX512FP16__ + #ifdef __AVX512VL__ + #define EIGEN_VECTORIZE_AVX512FP16 + #else + #if EIGEN_COMP_GNUC + #error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported. + #else + #error Please enable AVX512-VL in your compiler flags (e.g. -mavx512vl): compiling with AVX512-FP16 alone without AVX512-VL is not supported. + #endif + #endif + #endif #endif #endif @@ -339,7 +328,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 + #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN #include #else #include @@ -438,13 +427,15 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if defined(EIGEN_COMP_CLANG) - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + #if EIGEN_COMP_GNUC + // Make sure immintrin.h is included, even if e.g. vectorization is + // explicitly disabled (see also issue #2395). + // Note that FP16C intrinsics for gcc and clang are included by immintrin.h, + // as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 #include #endif @@ -468,10 +459,14 @@ #include #define EIGEN_HAS_HIP_FP16 #include + #define EIGEN_HAS_HIP_BF16 + #include #endif /** \brief Namespace containing all symbols from the %Eigen library. 
*/ +#include "../InternalHeaderCheck.h" + namespace Eigen { inline static const char *SimdInstructionSetsInUse(void) { diff --git a/libs/eigen/Eigen/src/Core/util/Constants.h b/libs/eigen/Eigen/src/Core/util/Constants.h index 35dcaa7..0175087 100644 --- a/libs/eigen/Eigen/src/Core/util/Constants.h +++ b/libs/eigen/Eigen/src/Core/util/Constants.h @@ -12,6 +12,8 @@ #ifndef EIGEN_CONSTANTS_H #define EIGEN_CONSTANTS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { /** This value means that a positive quantity (e.g., a size) is not known at compile-time, and that instead the value is @@ -312,7 +314,7 @@ enum SpecializedType { }; /** \ingroup enums - * Enum containing possible values for the \p _Options template parameter of + * Enum containing possible values for the \p Options_ template parameter of * Matrix, Array and BandMatrix. */ enum StorageOptions { /** Storage order is column major (see \ref TopicStorageOrders). */ @@ -421,14 +423,16 @@ enum DecompositionOptions { /** \ingroup enums * Possible values for the \p QRPreconditioner template parameter of JacobiSVD. */ enum QRPreconditioners { - /** Do not specify what is to be done if the SVD of a non-square matrix is asked for. */ - NoQRPreconditioner, - /** Use a QR decomposition without pivoting as the first step. */ - HouseholderQRPreconditioner, /** Use a QR decomposition with column pivoting as the first step. */ - ColPivHouseholderQRPreconditioner, + ColPivHouseholderQRPreconditioner = 0x0, + /** Do not specify what is to be done if the SVD of a non-square matrix is asked for. */ + NoQRPreconditioner = 0x40, + /** Use a QR decomposition without pivoting as the first step. */ + HouseholderQRPreconditioner = 0x80, /** Use a QR decomposition with full pivoting as the first step. */ - FullPivHouseholderQRPreconditioner + FullPivHouseholderQRPreconditioner = 0xC0, + /** Used to disable the QR Preconditioner in BDCSVD. */ + DisableQRDecomposition = NoQRPreconditioner }; #ifdef Success @@ -529,6 +533,7 @@ struct DenseShape { static std::string debugName() { return "DenseSh struct SolverShape { static std::string debugName() { return "SolverShape"; } }; struct HomogeneousShape { static std::string debugName() { return "HomogeneousShape"; } }; struct DiagonalShape { static std::string debugName() { return "DiagonalShape"; } }; +struct SkewSymmetricShape { static std::string debugName() { return "SkewSymmetricShape"; } }; struct BandShape { static std::string debugName() { return "BandShape"; } }; struct TriangularShape { static std::string debugName() { return "TriangularShape"; } }; struct SelfAdjointShape { static std::string debugName() { return "SelfAdjointShape"; } }; @@ -547,7 +552,7 @@ struct IteratorBased {}; /** \internal * Constants for comparison functors */ -enum ComparisonName { +enum ComparisonName : unsigned int { cmp_EQ = 0, cmp_LT = 1, cmp_LE = 2, diff --git a/libs/eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/libs/eigen/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100755 new mode 100644 index fe0cfec..0865fb6 --- a/libs/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/libs/eigen/Eigen/src/Core/util/DisableStupidWarnings.h @@ -1,9 +1,10 @@ #ifndef EIGEN_WARNINGS_DISABLED #define EIGEN_WARNINGS_DISABLED -#ifdef _MSC_VER +#if defined(_MSC_VER) // 4100 - unreferenced formal parameter (occurred e.g. 
in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -35,25 +36,28 @@ #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ - // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant - // this is really a stupid warning as it warns on compile-time expressions involving enums #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma clang diagnostic push #endif - #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if __clang_major__ >= 10 - #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" + #if defined(__has_warning) + // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant + // this is really a stupid warning as it warns on compile-time expressions involving enums + #if __has_warning("-Wconstant-logical-operand") + #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #endif + #if __has_warning("-Wimplicit-int-float-conversion") + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #if __has_warning("-Wc11-extensions") + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif + #endif #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined(__FUJITSU) #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push @@ -74,25 +78,53 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant + // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so + // we instead use Microsoft's __pragma extension. 
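As an aside on the mechanism being introduced here: _Pragma (C++11) is an operator taking a string literal, while MSVC's __pragma keyword takes raw tokens, which is why the patch stringizes the macro argument. A minimal freestanding sketch of the same trick, with hypothetical macro names (the patch's EIGEN_MAKE_PRAGMA below is the same idea):

    // MY_DIAG_SUPPRESS(20012) expands to _Pragma("nv_diag_suppress 20012"),
    // i.e. the same directive as writing: #pragma nv_diag_suppress 20012
    #define MY_MAKE_PRAGMA(X) _Pragma(#X)
    #define MY_DIAG_SUPPRESS(X) MY_MAKE_PRAGMA(nv_diag_suppress X)
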
+ #if defined _MSC_VER + #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) + #else + #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) + #endif + #if defined __NVCC_DIAG_PRAGMA_SUPPORT__ + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X) + #else + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X) + #endif + + EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant) // Disable the "statement is unreachable" message - #pragma diag_suppress code_is_unreachable + EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable) // Disable the "dynamic initialization in unreachable code" message - #pragma diag_suppress initialization_not_reachable + EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable) // Disable the "invalid error number" message that we get with older versions of nvcc - #pragma diag_suppress 1222 + EIGEN_NV_DIAG_SUPPRESS(1222) // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler) - #pragma diag_suppress 2527 - #pragma diag_suppress 2529 - #pragma diag_suppress 2651 - #pragma diag_suppress 2653 - #pragma diag_suppress 2668 - #pragma diag_suppress 2669 - #pragma diag_suppress 2670 - #pragma diag_suppress 2671 - #pragma diag_suppress 2735 - #pragma diag_suppress 2737 - #pragma diag_suppress 2739 + EIGEN_NV_DIAG_SUPPRESS(2527) + EIGEN_NV_DIAG_SUPPRESS(2529) + EIGEN_NV_DIAG_SUPPRESS(2651) + EIGEN_NV_DIAG_SUPPRESS(2653) + EIGEN_NV_DIAG_SUPPRESS(2668) + EIGEN_NV_DIAG_SUPPRESS(2669) + EIGEN_NV_DIAG_SUPPRESS(2670) + EIGEN_NV_DIAG_SUPPRESS(2671) + EIGEN_NV_DIAG_SUPPRESS(2735) + EIGEN_NV_DIAG_SUPPRESS(2737) + EIGEN_NV_DIAG_SUPPRESS(2739) + EIGEN_NV_DIAG_SUPPRESS(2885) + EIGEN_NV_DIAG_SUPPRESS(2888) + EIGEN_NV_DIAG_SUPPRESS(2976) + EIGEN_NV_DIAG_SUPPRESS(2979) + EIGEN_NV_DIAG_SUPPRESS(20011) + EIGEN_NV_DIAG_SUPPRESS(20014) + // Disable the "// __device__ annotation is ignored on a function(...) that is + // explicitly defaulted on its first declaration" message. + // The __device__ annotation seems to actually be needed in some cases, + // otherwise resulting in kernel runtime errors. 
+ EIGEN_NV_DIAG_SUPPRESS(2886) + EIGEN_NV_DIAG_SUPPRESS(2977) + EIGEN_NV_DIAG_SUPPRESS(20012) + #undef EIGEN_NV_DIAG_SUPPRESS + #undef EIGEN_MAKE_PRAGMA #endif #else diff --git a/libs/eigen/Eigen/src/Core/util/ForwardDeclarations.h b/libs/eigen/Eigen/src/Core/util/ForwardDeclarations.h index 2f9cc44..8f87c4a 100644 --- a/libs/eigen/Eigen/src/Core/util/ForwardDeclarations.h +++ b/libs/eigen/Eigen/src/Core/util/ForwardDeclarations.h @@ -11,6 +11,8 @@ #ifndef EIGEN_FORWARDDECLARATIONS_H #define EIGEN_FORWARDDECLARATIONS_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -49,24 +51,13 @@ template class DenseBase; template class PlainObjectBase; template class DenseCoeffsBase; -template class Matrix; template class MatrixBase; @@ -87,7 +78,6 @@ template class Transpose; template class Conjugate; template class CwiseNullaryOp; template class CwiseUnaryOp; -template class CwiseUnaryView; template class CwiseBinaryOp; template class CwiseTernaryOp; template class Solve; @@ -96,16 +86,19 @@ template class Inverse; template class Product; template class DiagonalBase; -template class DiagonalWrapper; -template class DiagonalMatrix; +template class DiagonalWrapper; +template class DiagonalMatrix; template class DiagonalProduct; template class Diagonal; +template class SkewSymmetricBase; +template class SkewSymmetricWrapper; +template class SkewSymmetricMatrix3; template class PermutationMatrix; template class Transpositions; template class PermutationBase; template class TranspositionsBase; -template class PermutationWrapper; -template class TranspositionsWrapper; +template class PermutationWrapper; +template class TranspositionsWrapper; template::has_write_access ? WriteAccessors : ReadOnlyAccessors @@ -116,7 +109,8 @@ template class OuterStride; template > class Map; template class RefBase; template,OuterStride<> >::type > class Ref; + typename StrideType = typename std::conditional_t,OuterStride<> > > class Ref; +template> class CwiseUnaryView; template class TriangularBase; template class TriangularView; @@ -142,7 +136,7 @@ template struct image_retval; } // end namespace internal namespace internal { -template class BandMatrix; +template class BandMatrix; } namespace internal { @@ -205,8 +199,12 @@ template struct scalar_cast_op; template struct scalar_random_op; template struct scalar_constant_op; template struct scalar_identity_op; -template struct scalar_sign_op; -template struct scalar_pow_op; +template struct scalar_sign_op; +template +struct scalar_pow_op; +template +struct scalar_unary_pow_op; template struct scalar_hypot_op; template struct scalar_product_op; template struct scalar_quotient_op; @@ -242,23 +240,12 @@ template struct scalar_bessel_k1e_op; struct IOFormat; // Array module -template class Array; + int MaxRows_ = Rows_, int MaxCols_ = Cols_> class Array; template class Select; template class PartialReduxExpr; template class VectorwiseOp; @@ -275,25 +262,27 @@ template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class CompleteOrthogonalDecomposition; template class SVDBase; -template class JacobiSVD; -template class BDCSVD; +template class JacobiSVD; +template class BDCSVD; template class LLT; template class LDLT; template class HouseholderSequence; template class JacobiRotation; // Geometry module: -template class RotationBase; -template class Cross; +namespace internal { +template::SizeAtCompileTime> struct cross_impl; +} +template class RotationBase; template class QuaternionBase; template class Rotation2D; 
template class AngleAxis; template class Translation; template class AlignedBox; template class Quaternion; -template class Transform; -template class ParametrizedLine; -template class Hyperplane; +template class Transform; +template class ParametrizedLine; +template class Hyperplane; template class UniformScaling; template class Homogeneous; diff --git a/libs/eigen/Eigen/src/Core/util/IndexedViewHelper.h b/libs/eigen/Eigen/src/Core/util/IndexedViewHelper.h index f85de30..19fa45d 100644 --- a/libs/eigen/Eigen/src/Core/util/IndexedViewHelper.h +++ b/libs/eigen/Eigen/src/Core/util/IndexedViewHelper.h @@ -11,11 +11,17 @@ #ifndef EIGEN_INDEXED_VIEW_HELPER_H #define EIGEN_INDEXED_VIEW_HELPER_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { struct symbolic_last_tag {}; -} +} // namespace internal + +namespace placeholders { + +typedef symbolic::SymbolExpr last_t; /** \var last * \ingroup Core_Module @@ -28,38 +34,20 @@ struct symbolic_last_tag {}; * A typical usage example would be: * \code * using namespace Eigen; - * using Eigen::last; + * using Eigen::placeholders::last; * VectorXd v(n); * v(seq(2,last-2)).setOnes(); * \endcode * * \sa end */ -static const symbolic::SymbolExpr last; // PLEASE use Eigen::last instead of Eigen::placeholders::last +static const last_t last; -/** \var lastp1 - * \ingroup Core_Module - * - * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically - * reference the last+1 element/row/columns of the underlying vector or matrix once - * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). - * - * This symbolic placeholder supports standard arithmetic operations. - * It is essentially an alias to last+fix<1>. - * - * \sa last - */ -#ifdef EIGEN_PARSED_BY_DOXYGEN -static const auto lastp1 = last+fix<1>; -#else -// Using a FixedExpr<1> expression is important here to make sure the compiler -// can fully optimize the computation starting indices with zero overhead. 
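A compilable variant of the doc snippet above (a minimal sketch, assuming the post-patch namespace layout where last lives in Eigen::placeholders):

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      using Eigen::seq;
      using Eigen::placeholders::last;
      Eigen::VectorXd v = Eigen::VectorXd::Zero(10);
      v(seq(2, last - 2)).setOnes();       // with size 10, last == 9, so indices 2..7
      std::cout << v.transpose() << "\n";  // prints: 0 0 1 1 1 1 1 1 0 0
      return 0;
    }
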
-static const symbolic::AddExpr,symbolic::ValueExpr > > lastp1(last+fix<1>()); -#endif +} // namespace placeholders namespace internal { - // Replace symbolic last/end "keywords" by their true runtime value +// Replace symbolic last/end "keywords" by their true runtime value inline Index eval_expr_given_size(Index x, Index /* size */) { return x; } template @@ -68,7 +56,7 @@ FixedInt eval_expr_given_size(FixedInt x, Index /*size*/) { return x; } template Index eval_expr_given_size(const symbolic::BaseExpr &x, Index size) { - return x.derived().eval(last=size-1); + return x.derived().eval(Eigen::placeholders::last=size-1); } // Extract increment/step at compile time @@ -111,7 +99,7 @@ template<> struct get_compile_time_incr { // Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods) template -struct IndexedViewCompatibleType::value>::type> { +struct IndexedViewCompatibleType::value>> { // Here we could simply use Array, but maybe it's less work for the compiler to use // a simpler wrapper as SingleRange //typedef Eigen::Array type; @@ -119,13 +107,13 @@ struct IndexedViewCompatibleType -struct IndexedViewCompatibleType::value>::type> { +struct IndexedViewCompatibleType::value>> { typedef SingleRange type; }; template -typename enable_if::value,SingleRange>::type +std::enable_if_t::value,SingleRange> makeIndexedViewCompatible(const T& id, Index size, SpecializedType) { return eval_expr_given_size(id,size); } @@ -163,23 +151,44 @@ template struct get_compile_time_incr > { } // end namespace internal +namespace placeholders { + +typedef symbolic::AddExpr,symbolic::ValueExpr > > lastp1_t; +typedef Eigen::internal::all_t all_t; + +/** \var lastp1 + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically + * reference the last+1 element/row/columns of the underlying vector or matrix once + * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * It is essentially an alias to last+fix<1>. + * + * \sa last + */ +#ifdef EIGEN_PARSED_BY_DOXYGEN +static const auto lastp1 = last+fix<1>; +#else +// Using a FixedExpr<1> expression is important here to make sure the compiler +// can fully optimize the computation starting indices with zero overhead. 
+static const lastp1_t lastp1(last+fix<1>()); +#endif + +/** \var end + * \ingroup Core_Module + * \sa lastp1 + */ +static const lastp1_t end = lastp1; /** \var all * \ingroup Core_Module * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns */ -static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all +static const Eigen::internal::all_t all; - -namespace placeholders { - typedef symbolic::SymbolExpr last_t; - typedef symbolic::AddExpr,symbolic::ValueExpr > > end_t; - typedef Eigen::internal::all_t all_t; - - EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all - EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last - EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end -} +} // namespace placeholders } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/util/IntegralConstant.h b/libs/eigen/Eigen/src/Core/util/IntegralConstant.h index 945d426..ea275bd 100644 --- a/libs/eigen/Eigen/src/Core/util/IntegralConstant.h +++ b/libs/eigen/Eigen/src/Core/util/IntegralConstant.h @@ -11,6 +11,8 @@ #ifndef EIGEN_INTEGRAL_CONSTANT_H #define EIGEN_INTEGRAL_CONSTANT_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -29,10 +31,8 @@ template class VariableAndFixedInt; * - arithmetic and some bitwise operators: -, +, *, /, %, &, | * - c++98/14 compatibility with fix and fix() syntax to define integral constants. * - * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to - * be created by the user using Eigen::fix or Eigen::fix(). In C++98-11, the former syntax does - * not create a FixedInt instance but rather a point to function that needs to be \em cleaned-up - * using the generic helper: + * It is strongly discouraged to directly deal with this class FixedInt. Instances are expected to + * be created by the user using Eigen::fix or Eigen::fix(). 
* \code * internal::cleanup_index_type::type * internal::cleanup_index_type::type @@ -53,7 +53,14 @@ template class FixedInt public: static const int value = N; EIGEN_CONSTEXPR operator int() const { return value; } - FixedInt() {} + + EIGEN_CONSTEXPR + FixedInt() = default; + + EIGEN_CONSTEXPR + FixedInt(std::integral_constant) {} + + EIGEN_CONSTEXPR FixedInt( VariableAndFixedInt other) { #ifndef EIGEN_INTERNAL_DEBUGGING EIGEN_UNUSED_VARIABLE(other); @@ -61,34 +68,41 @@ public: eigen_internal_assert(int(other)==N); } + EIGEN_CONSTEXPR FixedInt<-N> operator-() const { return FixedInt<-N>(); } + template + EIGEN_CONSTEXPR FixedInt operator+( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator-( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator*( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator/( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator%( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator|( FixedInt) const { return FixedInt(); } + template + EIGEN_CONSTEXPR FixedInt operator&( FixedInt) const { return FixedInt(); } -#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // Needed in C++14 to allow fix(): - FixedInt operator() () const { return *this; } + EIGEN_CONSTEXPR FixedInt operator() () const { return *this; } VariableAndFixedInt operator() (int val) const { return VariableAndFixedInt(val); } -#else - FixedInt ( FixedInt (*)() ) {} -#endif - -#if EIGEN_HAS_CXX11 - FixedInt(std::integral_constant) {} -#endif }; /** \internal @@ -138,12 +152,6 @@ template struct get_fixed_value,Default> { static const int value = N; }; -#if !EIGEN_HAS_CXX14 -template struct get_fixed_value (*)(),Default> { - static const int value = N; -}; -#endif - template struct get_fixed_value,Default> { static const int value = N ; }; @@ -154,9 +162,6 @@ struct get_fixed_value,Default> { }; template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 -template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } -#endif // Cleanup integer/FixedInt/VariableAndFixedInt/etc types: @@ -164,38 +169,21 @@ template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { r template struct cleanup_index_type { typedef T type; }; // Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index -template struct cleanup_index_type::value>::type> { typedef Index type; }; - -#if !EIGEN_HAS_CXX14 -// In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: -template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; -#endif +template struct cleanup_index_type::value>> { typedef Index type; }; // If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value: template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; // If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index): template struct cleanup_index_type, DynamicKey> { typedef Index type; }; -#if EIGEN_HAS_CXX11 template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; -#endif } // end namespace internal #ifndef EIGEN_PARSED_BY_DOXYGEN -#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template -static const internal::FixedInt fix{}; -#else -template -inline internal::FixedInt fix() { return internal::FixedInt(); } - -// The generic typename T is mandatory. 
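For illustration, the retained C++14 form in action (a minimal sketch; only the variable-template spelling of fix survives the cleanup above):

    #include <Eigen/Dense>
    int main() {
      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10, 0.0, 9.0);
      // Eigen::fix<4> is a constexpr internal::FixedInt<4> object, so the
      // segment length below is a compile-time constant.
      auto head4 = v(Eigen::seqN(Eigen::fix<0>, Eigen::fix<4>));
      return head4.size() == 4 ? 0 : 1;
    }
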
Otherwise, a code like fix could refer to either the function above or this next overload. -// This way a code like fix can only refer to the previous function. -template -inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAndFixedInt(internal::convert_index(val)); } -#endif +constexpr internal::FixedInt fix{}; #else // EIGEN_PARSED_BY_DOXYGEN @@ -221,14 +209,6 @@ inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAn * \c std::integral_constant * Here, \c fix is thus an object of type \c internal::FixedInt. * - * In c++98/11, it is implemented as a function: - * \code - * template inline internal::FixedInt fix(); - * \endcode - * Here internal::FixedInt is thus a pointer to function. - * - * If for some reason you want a true object in c++98 then you can write: \code fix() \endcode which is also valid in c++14. - * * \sa fix(int), seq, seqN */ template diff --git a/libs/eigen/Eigen/src/Core/util/MKL_support.h b/libs/eigen/Eigen/src/Core/util/MKL_support.h old mode 100755 new mode 100644 index 17963fa..9cf5f6f --- a/libs/eigen/Eigen/src/Core/util/MKL_support.h +++ b/libs/eigen/Eigen/src/Core/util/MKL_support.h @@ -120,6 +120,8 @@ #include "../../misc/blas.h" #endif +#include "../InternalHeaderCheck.h" + namespace Eigen { typedef std::complex dcomplex; diff --git a/libs/eigen/Eigen/src/Core/util/Macros.h b/libs/eigen/Eigen/src/Core/util/Macros.h index 986c3d4..4b8b277 100644 --- a/libs/eigen/Eigen/src/Core/util/Macros.h +++ b/libs/eigen/Eigen/src/Core/util/Macros.h @@ -10,6 +10,7 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H +#include "../InternalHeaderCheck.h" //------------------------------------------------------------------------------------------ // Eigen version and basic defaults @@ -17,7 +18,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 4 -#define EIGEN_MINOR_VERSION 0 +#define EIGEN_MINOR_VERSION 90 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -86,13 +87,20 @@ #define EIGEN_COMP_LLVM 0 #endif -/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel compiler, 0 otherwise +/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel icc compiler, 0 otherwise #if defined(__INTEL_COMPILER) #define EIGEN_COMP_ICC __INTEL_COMPILER #else #define EIGEN_COMP_ICC 0 #endif +/// \internal EIGEN_COMP_CLANGICC set to __INTEL_CLANG_COMPILER if the compiler is Intel icx compiler, 0 otherwise +#if defined(__INTEL_CLANG_COMPILER) + #define EIGEN_COMP_CLANGICC __INTEL_CLANG_COMPILER +#else + #define EIGEN_COMP_CLANGICC 0 +#endif + /// \internal EIGEN_COMP_MINGW set to 1 if the compiler is mingw #if defined(__MINGW32__) #define EIGEN_COMP_MINGW 1 @@ -128,10 +136,6 @@ // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: // name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 // 2015 14 1900 // "15" 15 1900 // 2017-14.1 15.0 1910 @@ -139,6 +143,9 @@ // 2017-14.12 15.5 1912 // 2017-14.13 15.6 1913 // 2017-14.14 15.7 1914 +// 2017 15.8 1915 +// 2017 15.9 1916 +// 2019 RTW 16.0 1920 /// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise. 
#if defined(_MSVC_LANG)
@@ -193,9 +200,52 @@
 #define EIGEN_COMP_EMSCRIPTEN 0
 #endif

+/// \internal EIGEN_COMP_FCC set to FCC version if the compiler is Fujitsu Compiler (traditional mode)
+/// \note The Fujitsu C/C++ compiler uses the traditional mode based
+/// on EDG g++ 6.1 by default or if invoked with the -Nnoclang flag
+#if defined(__FUJITSU)
+  #define EIGEN_COMP_FCC (__FCC_major__*100+__FCC_minor__*10+__FCC_patchlevel__)
+#else
+  #define EIGEN_COMP_FCC 0
+#endif
+
+/// \internal EIGEN_COMP_CLANGFCC set to FCC version if the compiler is Fujitsu Compiler (Clang mode)
+/// \note The Fujitsu C/C++ compiler uses the non-traditional mode
+/// based on Clang 7.1.0 if invoked with the -Nclang flag
+#if defined(__CLANG_FUJITSU)
+  #define EIGEN_COMP_CLANGFCC (__FCC_major__*100+__FCC_minor__*10+__FCC_patchlevel__)
+#else
+  #define EIGEN_COMP_CLANGFCC 0
+#endif
+
+/// \internal EIGEN_COMP_CPE set to CPE version if the compiler is HPE Cray Compiler (GCC based)
+/// \note This is the SVE-enabled C/C++ compiler from the HPE Cray
+/// Programming Environment (CPE) based on Cray GCC 8.1
+#if defined(_CRAYC) && !defined(__clang__)
+  #define EIGEN_COMP_CPE (_RELEASE_MAJOR*100+_RELEASE_MINOR*10+_RELEASE_PATCHLEVEL)
+#else
+  #define EIGEN_COMP_CPE 0
+#endif
+
+/// \internal EIGEN_COMP_CLANGCPE set to CPE version if the compiler is HPE Cray Compiler (Clang based)
+/// \note This is the C/C++ compiler from the HPE Cray Programming
+/// Environment (CPE) based on Cray Clang 11.0 without SVE support
+#if defined(_CRAYC) && defined(__clang__)
+  #define EIGEN_COMP_CLANGCPE (_RELEASE_MAJOR*100+_RELEASE_MINOR*10+_RELEASE_PATCHLEVEL)
+#else
+  #define EIGEN_COMP_CLANGCPE 0
+#endif
+
+/// \internal EIGEN_COMP_LCC set to 1 if the compiler is MCST-LCC (MCST eLbrus Compiler Collection)
+#if defined(__LCC__) && defined(__MCST__)
+  #define EIGEN_COMP_LCC (__LCC__*100+__LCC_MINOR__)
+#else
+  #define EIGEN_COMP_LCC 0
+#endif
+
 /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
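The Fujitsu and Cray macros above all encode versions as major*100 + minor*10 + patch (LCC drops the patch digit and uses major*100 + minor). A compile-time rendering of that scheme, with hypothetical values not tied to any real compiler release:

    // Fujitsu FCC 4.8.1 would yield EIGEN_COMP_FCC == 481 under this scheme.
    // A minor version or patch level above 9 would spill into the neighbouring
    // digit, so the encoding assumes single-digit components.
    constexpr int encode_version(int major, int minor, int patch) { return major * 100 + minor * 10 + patch; }
    static_assert(encode_version(4, 8, 1) == 481, "major*100 + minor*10 + patch");
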
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 @@ -212,14 +262,6 @@ #define EIGEN_GNUC_AT(x,y) 0 #endif -// FIXME: could probably be removed as we do not support gcc 3.x anymore -#if EIGEN_COMP_GNUC && (__GNUC__ <= 3) -#define EIGEN_GCC3_OR_OLDER 1 -#else -#define EIGEN_GCC3_OR_OLDER 0 -#endif - - //------------------------------------------------------------------------------------------ // Architecture identification, EIGEN_ARCH_* @@ -575,13 +617,6 @@ // Detect Compiler/Architecture/OS specific features //------------------------------------------------------------------------------------------ -#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG - // see bug 89 - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0 -#else - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 -#endif - // Cross compiler wrapper around LLVM's __has_builtin #ifdef __has_builtin # define EIGEN_HAS_BUILTIN(x) __has_builtin(x) @@ -595,16 +630,6 @@ # define __has_feature(x) 0 #endif -// Some old compilers do not support template specializations like: -// template void foo(const T x[N]); -#if !( EIGEN_COMP_CLANG && ( (EIGEN_COMP_CLANG<309) \ - || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000))) \ - || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) -#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 -#else -#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 -#endif - // The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that // works for both platforms, indicating the C++ standard version number. // @@ -622,14 +647,14 @@ #define EIGEN_CPLUSPLUS 0 #endif -// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler. +// The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler. // For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER // is defined to 17. -#if EIGEN_CPLUSPLUS > 201703L +#if EIGEN_CPLUSPLUS >= 202002L #define EIGEN_COMP_CXXVER 20 -#elif EIGEN_CPLUSPLUS > 201402L +#elif EIGEN_CPLUSPLUS >= 201703L #define EIGEN_COMP_CXXVER 17 -#elif EIGEN_CPLUSPLUS > 201103L +#elif EIGEN_CPLUSPLUS >= 201402L #define EIGEN_COMP_CXXVER 14 #elif EIGEN_CPLUSPLUS >= 201103L #define EIGEN_COMP_CXXVER 11 @@ -637,73 +662,37 @@ #define EIGEN_COMP_CXXVER 03 #endif -#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES - #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 - #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 - #else - #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 - #endif -#endif - // The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features -// but in practice we should not rely on them but rather on the availabilty of +// but in practice we should not rely on them but rather on the availability of // individual features as defined later. // This is why there is no EIGEN_HAS_CXX17. -// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. 
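The corrected ladder above (>= against each standard's own value, rather than > against the previous one) maps __cplusplus/_MSVC_LANG onto a two-digit version number; an equivalent constexpr rendering for reference:

    constexpr int comp_cxxver(long cplusplus) {
      return cplusplus >= 202002L ? 20
           : cplusplus >= 201703L ? 17
           : cplusplus >= 201402L ? 14
           : cplusplus >= 201103L ? 11
           : 3;  // EIGEN_COMP_CXXVER 03
    }
    static_assert(comp_cxxver(201703L) == 17, "-std=c++17");
    static_assert(comp_cxxver(201402L) == 14, "-std=c++14, the new minimum");
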
-#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 -#define EIGEN_HAS_CXX11 1 -#else -#define EIGEN_HAS_CXX11 0 -#endif - -#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 -#define EIGEN_HAS_CXX14 1 -#else -#define EIGEN_HAS_CXX14 0 -#endif - -// Do we support r-value references? -#ifndef EIGEN_HAS_RVALUE_REFERENCES -#if EIGEN_MAX_CPP_VER>=11 && \ - (__has_feature(cxx_rvalue_references) || \ - (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) - #define EIGEN_HAS_RVALUE_REFERENCES 1 -#else - #define EIGEN_HAS_RVALUE_REFERENCES 0 -#endif +#if EIGEN_MAX_CPP_VER<14 || EIGEN_COMP_CXXVER<14 || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \ + (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) || \ + (EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))) || \ + (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<51) +#error This compiler appears to be too old to be supported by Eigen #endif // Does the compiler support C99? // Need to include to make sure _GLIBCXX_USE_C99 gets defined #include #ifndef EIGEN_HAS_C99_MATH -#if EIGEN_MAX_CPP_VER>=11 && \ - ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ +#if ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ - || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY)) + || (EIGEN_COMP_MSVC) || defined(SYCL_DEVICE_ONLY)) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 #endif #endif -// Does the compiler support result_of? -// result_of was deprecated in c++17 and removed in c++ 20 -#ifndef EIGEN_HAS_STD_RESULT_OF -#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17 -#define EIGEN_HAS_STD_RESULT_OF 1 -#else -#define EIGEN_HAS_STD_RESULT_OF 0 -#endif -#endif - // Does the compiler support std::hash? #ifndef EIGEN_HAS_STD_HASH // The std::hash struct is defined in C++11 but is not labelled as a __device__ // function and is not constexpr, so cannot be used on device. -#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_HAS_STD_HASH 1 #else #define EIGEN_HAS_STD_HASH 0 @@ -718,128 +707,7 @@ #endif #endif -#ifndef EIGEN_HAS_ALIGNAS -#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ - ( __has_feature(cxx_alignas) \ - || EIGEN_HAS_CXX14 \ - || (EIGEN_COMP_MSVC >= 1800) \ - || (EIGEN_GNUC_AT_LEAST(4,8)) \ - || (EIGEN_COMP_CLANG>=305) \ - || (EIGEN_COMP_ICC>=1500) \ - || (EIGEN_COMP_PGI>=1500) \ - || (EIGEN_COMP_SUNCC>=0x5130)) -#define EIGEN_HAS_ALIGNAS 1 -#else -#define EIGEN_HAS_ALIGNAS 0 -#endif -#endif - -// Does the compiler support type_traits? -// - full support of type traits was added only to GCC 5.1.0. -// - 20150626 corresponds to the last release of 4.x libstdc++ -#ifndef EIGEN_HAS_TYPE_TRAITS -#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \ - && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \ - && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626) -#define EIGEN_HAS_TYPE_TRAITS 1 -#define EIGEN_INCLUDE_TYPE_TRAITS -#else -#define EIGEN_HAS_TYPE_TRAITS 0 -#endif -#endif - -// Does the compiler support variadic templates? 
-#ifndef EIGEN_HAS_VARIADIC_TEMPLATES -#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) - // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: - // this prevents nvcc from crashing when compiling Eigen on Tegra X1 -#define EIGEN_HAS_VARIADIC_TEMPLATES 1 -#elif EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY) -#define EIGEN_HAS_VARIADIC_TEMPLATES 1 -#else -#define EIGEN_HAS_VARIADIC_TEMPLATES 0 -#endif -#endif - -// Does the compiler fully support const expressions? (as in c++14) -#ifndef EIGEN_HAS_CONSTEXPR - #if defined(EIGEN_CUDACC) - // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above - #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 - #endif - #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \ - (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11))) - #define EIGEN_HAS_CONSTEXPR 1 - #endif - - #ifndef EIGEN_HAS_CONSTEXPR - #define EIGEN_HAS_CONSTEXPR 0 - #endif - -#endif // EIGEN_HAS_CONSTEXPR - -#if EIGEN_HAS_CONSTEXPR #define EIGEN_CONSTEXPR constexpr -#else -#define EIGEN_CONSTEXPR -#endif - -// Does the compiler support C++11 math? -// Let's be conservative and enable the default C++11 implementation only if we are sure it exists -#ifndef EIGEN_HAS_CXX11_MATH - #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ - && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)) - #define EIGEN_HAS_CXX11_MATH 1 - #else - #define EIGEN_HAS_CXX11_MATH 0 - #endif -#endif - -// Does the compiler support proper C++11 containers? -#ifndef EIGEN_HAS_CXX11_CONTAINERS - #if EIGEN_MAX_CPP_VER>=11 && \ - ((EIGEN_COMP_CXXVER > 11) \ - || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) - #define EIGEN_HAS_CXX11_CONTAINERS 1 - #else - #define EIGEN_HAS_CXX11_CONTAINERS 0 - #endif -#endif - -// Does the compiler support C++11 noexcept? -#ifndef EIGEN_HAS_CXX11_NOEXCEPT - #if EIGEN_MAX_CPP_VER>=11 && \ - (__has_feature(cxx_noexcept) \ - || (EIGEN_COMP_CXXVER > 11) \ - || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) - #define EIGEN_HAS_CXX11_NOEXCEPT 1 - #else - #define EIGEN_HAS_CXX11_NOEXCEPT 0 - #endif -#endif - -#ifndef EIGEN_HAS_CXX11_ATOMIC - #if EIGEN_MAX_CPP_VER>=11 && \ - (__has_feature(cxx_atomic) \ - || (EIGEN_COMP_CXXVER > 11) \ - || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700))) - #define EIGEN_HAS_CXX11_ATOMIC 1 - #else - #define EIGEN_HAS_CXX11_ATOMIC 0 - #endif -#endif - -#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL - #if EIGEN_MAX_CPP_VER>=11 && \ - (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700) - #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1 - #else - #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0 - #endif -#endif // NOTE: the required Apple's clang version is very conservative // and it could be that XCode 9 works just fine. 
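With the feature probes above removed, the corresponding convenience macros collapse to their modern spellings; EIGEN_CONSTEXPR, for instance, is now unconditionally constexpr. A small sketch (assuming Eigen/Core is included; MySize is a hypothetical user type):

    struct MySize {
      EIGEN_CONSTEXPR int value() const { return 3; }  // always compiles as: constexpr int value() const
    };
    static_assert(MySize().value() == 3, "usable in constant expressions under the C++14 baseline");
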
@@ -858,7 +726,7 @@ #endif #endif -#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR +#if defined(EIGEN_CUDACC) // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules #if defined(__NVCC__) // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr @@ -918,15 +786,11 @@ #endif #endif -// EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible +// EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible // attribute to maximize inlining. This should only be used when really necessary: in particular, // it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times. // FIXME with the always_inline attribute, -// gcc 3.4.x and 4.1 reports the following compilation error: -// Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval Eigen::MatrixBase::eval() const' -// : function body not available -// See also bug 1367 -#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY) +#if EIGEN_COMP_GNUC && !defined(SYCL_DEVICE_ONLY) #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline #else #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE @@ -998,38 +862,7 @@ #define eigen_plain_assert(x) #endif #else - #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO - namespace Eigen { - namespace internal { - inline bool copy_bool(bool b) { return b; } - } - } #define eigen_plain_assert(x) assert(x) - #else - // work around bug 89 - #include // for abort - #include // for std::cerr - - namespace Eigen { - namespace internal { - // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers. - // see bug 89. - namespace { - EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; } - } - inline void assert_fail(const char *condition, const char *function, const char *file, int line) - { - std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl; - abort(); - } - } - } - #define eigen_plain_assert(x) \ - do { \ - if(!Eigen::internal::copy_bool(x)) \ - Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \ - } while(false) - #endif #endif // eigen_assert can be overridden @@ -1067,10 +900,26 @@ #define EIGEN_UNUSED #endif +#if EIGEN_COMP_GNUC + #define EIGEN_PRAGMA(tokens) _Pragma(#tokens) + #define EIGEN_DIAGNOSTICS(tokens) EIGEN_PRAGMA(GCC diagnostic tokens) + #define EIGEN_DIAGNOSTICS_OFF(msc, gcc) EIGEN_DIAGNOSTICS(gcc) +#elif EIGEN_COMP_MSVC + #define EIGEN_PRAGMA(tokens) __pragma(tokens) + #define EIGEN_DIAGNOSTICS(tokens) EIGEN_PRAGMA(warning(tokens)) + #define EIGEN_DIAGNOSTICS_OFF(msc, gcc) EIGEN_DIAGNOSTICS(msc) +#else + #define EIGEN_PRAGMA(tokens) + #define EIGEN_DIAGNOSTICS(tokens) + #define EIGEN_DIAGNOSTICS_OFF(msc, gcc) +#endif + +#define EIGEN_DISABLE_DEPRECATED_WARNING EIGEN_DIAGNOSTICS_OFF(disable : 4996, ignored "-Wdeprecated-declarations") + // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -1130,8 +979,17 @@ namespace Eigen { // General, Altivec, VSX. 
  #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
 #elif EIGEN_ARCH_ARM_OR_ARM64
-  // General, NEON.
-  #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+  #ifdef __ARM_FP
+    // General, VFP or NEON.
+    // Clang doesn't like "r",
+    // error: non-trivial scalar-to-vector conversion, possible invalid
+    //        constraint for vector type
+    #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+  #else
+    // Arm without VFP or NEON.
+    // "w" constraint will not compile.
+    #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g" (X));
+  #endif
 #elif EIGEN_ARCH_i386_OR_x86_64
   // General, SSE.
   #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
@@ -1185,8 +1043,8 @@ namespace Eigen {
 #define EIGEN_USING_STD(FUNC) using std::FUNC;
 #endif
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC))
-  // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary,
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_NVCC
+  // When compiling with NVCC, using the base operator is necessary,
   // otherwise we get duplicate definition errors
   // For later MSVC versions, we require explicit operator= definition, otherwise we get
   // use of implicitly deleted operator errors.
@@ -1215,11 +1073,7 @@ namespace Eigen {
  * \brief Macro to explicitly define the default copy constructor.
  * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
  */
-#if EIGEN_HAS_CXX11
-#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default;
-#else
-#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
-#endif
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
@@ -1239,15 +1093,9 @@ namespace Eigen {
  *
  * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision
  */
-#if EIGEN_HAS_CXX11
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
-    Derived() = default; \
-    ~Derived() = default;
-#else
-#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
-    Derived() {}; \
-    /* ~Derived() {}; */
-#endif
+    EIGEN_DEVICE_FUNC Derived() = default; \
+    EIGEN_DEVICE_FUNC ~Derived() = default;
@@ -1285,35 +1133,6 @@ namespace Eigen {
   typedef typename Base::PacketScalar PacketScalar;
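Spelled out, the simplified member-defaulting macros above now always produce device-annotated defaults (a sketch; MyXpr is hypothetical and Eigen/Core is assumed to be included):

    class MyXpr {
    public:
      // Expands to: EIGEN_DEVICE_FUNC MyXpr(const MyXpr&) = default;
      EIGEN_DEFAULT_COPY_CONSTRUCTOR(MyXpr)
      // Expands to defaulted, EIGEN_DEVICE_FUNC-annotated ctor and dtor.
      EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MyXpr)
      // Declaring copy-assignment is what would otherwise deprecate the
      // implicit copy constructor, hence the explicit default above.
      MyXpr& operator=(const MyXpr&) { return *this; }
    };
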
(int)a \ - : ((int)a <= (int)b) ? (int)a : (int)b) - -// see EIGEN_SIZE_MIN_PREFER_DYNAMIC. No need for a separate variant for MaxSizes here. -#define EIGEN_SIZE_MAX(a,b) (((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \ - : ((int)a >= (int)b) ? (int)a : (int)b) - -#define EIGEN_LOGICAL_XOR(a,b) (((a) || (b)) && !((a) && (b))) - -#define EIGEN_IMPLIES(a,b) (!(a) || (b)) - #if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC #define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false)) #define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true)) @@ -1352,16 +1171,9 @@ namespace Eigen { CwiseBinaryOp::Scalar>, \ const typename internal::plain_constant_type::type, const EXPR> -// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) -#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type -#else -#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X -#endif - #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME)\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \ @@ -1370,7 +1182,7 @@ namespace Eigen { #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \ @@ -1408,26 +1220,12 @@ namespace Eigen { #endif -#if EIGEN_HAS_CXX11_NOEXCEPT -# define EIGEN_INCLUDE_TYPE_TRAITS -# define EIGEN_NOEXCEPT noexcept -# define EIGEN_NOEXCEPT_IF(x) noexcept(x) -# define EIGEN_NO_THROW noexcept(true) -# define EIGEN_EXCEPTION_SPEC(X) noexcept(false) -#else -# define EIGEN_NOEXCEPT -# define EIGEN_NOEXCEPT_IF(x) -# define EIGEN_NO_THROW throw() -# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 - // MSVC does not support exception specifications (warning C4290), - // and they are deprecated in c++11 anyway. This is even an error in c++17. -# define EIGEN_EXCEPTION_SPEC(X) throw() -# else -# define EIGEN_EXCEPTION_SPEC(X) throw(X) -# endif -#endif +#define EIGEN_NOEXCEPT noexcept +#define EIGEN_NOEXCEPT_IF(x) noexcept(x) +#define EIGEN_NO_THROW noexcept(true) +#define EIGEN_EXCEPTION_SPEC(X) noexcept(false) + -#if EIGEN_HAS_VARIADIC_TEMPLATES // The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. namespace Eigen { namespace internal { @@ -1439,16 +1237,10 @@ bool all(T t, Ts ... 
ts){ return t && all(ts...); } } } -#endif -#if EIGEN_HAS_CXX11_OVERRIDE_FINAL // provide override and final specifiers if they are available: -# define EIGEN_OVERRIDE override -# define EIGEN_FINAL final -#else -# define EIGEN_OVERRIDE -# define EIGEN_FINAL -#endif +#define EIGEN_OVERRIDE override +#define EIGEN_FINAL final // Wrapping #pragma unroll in a macro since it is required for SYCL #if defined(SYCL_DEVICE_ONLY) @@ -1461,4 +1253,12 @@ bool all(T t, Ts ... ts){ return t && all(ts...); } #define EIGEN_UNROLL_LOOP #endif +// Notice: Use this macro with caution. The code in the if body should still +// compile with C++14. +#if defined(EIGEN_HAS_CXX17_IFCONSTEXPR) +#define EIGEN_IF_CONSTEXPR(X) if constexpr (X) +#else +#define EIGEN_IF_CONSTEXPR(X) if (X) +#endif + #endif // EIGEN_MACROS_H diff --git a/libs/eigen/Eigen/src/Core/util/Memory.h b/libs/eigen/Eigen/src/Core/util/Memory.h index 875318c..e4a8793 100644 --- a/libs/eigen/Eigen/src/Core/util/Memory.h +++ b/libs/eigen/Eigen/src/Core/util/Memory.h @@ -59,6 +59,8 @@ #endif +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -94,19 +96,17 @@ inline void throw_std_bad_alloc() /* ----- Hand made implementations of aligned malloc/free and realloc ----- */ -/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. - * Fast, but wastes 16 additional bytes of memory. Does not throw any exception. +/** \internal Like malloc, but the returned pointer is guaranteed to be aligned to `alignment`. + * Fast, but wastes `alignment` additional bytes of memory. Does not throw any exception. */ EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { - eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); - - EIGEN_USING_STD(malloc) - void *original = malloc(size+alignment); - + eigen_assert(alignment >= sizeof(void*) && alignment <= 128 && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*), less than or equal to 128, and a power of 2"); + void* original = std::malloc(size + alignment); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); - *(reinterpret_cast(aligned) - 1) = original; + uint8_t offset = static_cast(alignment - (reinterpret_cast(original) & (alignment - 1))); + void* aligned = static_cast(static_cast(original) + offset); + *(static_cast(aligned) - 1) = offset; return aligned; } @@ -114,8 +114,9 @@ EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::si EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) { if (ptr) { - EIGEN_USING_STD(free) - free(*(reinterpret_cast(ptr) - 1)); + uint8_t offset = static_cast(*(static_cast(ptr) - 1)); + void* original = static_cast(static_cast(ptr) - offset); + std::free(original); } } @@ -124,19 +125,22 @@ EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) * Since we know that our handmade version is based on std::malloc * we can use std::realloc to implement efficient reallocation. 
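The rewritten helpers above replace the stored void* with a single offset byte kept just before the aligned address, which is why the new assertion caps the alignment at 128 (so the offset, at most equal to the alignment, fits in a uint8_t). The same scheme in freestanding form, as a sketch rather than Eigen's actual code:

    #include <cstdlib>
    #include <cstdint>
    // alignment must be a power of two and at most 255 (Eigen asserts <= 128).
    void* my_aligned_malloc(std::size_t size, std::size_t alignment) {
      // Over-allocate by 'alignment' so an aligned address with at least one
      // spare leading byte always exists inside the block.
      void* original = std::malloc(size + alignment);
      if (original == nullptr) return nullptr;
      std::uint8_t offset = static_cast<std::uint8_t>(
          alignment - (reinterpret_cast<std::uintptr_t>(original) & (alignment - 1)));
      void* aligned = static_cast<std::uint8_t*>(original) + offset;  // offset is in [1, alignment]
      static_cast<std::uint8_t*>(aligned)[-1] = offset;               // remember how far we moved
      return aligned;
    }
    void my_aligned_free(void* ptr) {
      if (ptr == nullptr) return;
      std::uint8_t offset = static_cast<std::uint8_t*>(ptr)[-1];
      std::free(static_cast<std::uint8_t*>(ptr) - offset);
    }
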
*/ -inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0) +EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { - if (ptr == 0) return handmade_aligned_malloc(size); - void *original = *(reinterpret_cast(ptr) - 1); - std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); - original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES); + if (ptr == 0) return handmade_aligned_malloc(new_size, alignment); + uint8_t old_offset = *(static_cast(ptr) - 1); + void* old_original = static_cast(ptr) - old_offset; + void* original = std::realloc(old_original, new_size + alignment); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); - void *previous_aligned = static_cast(original)+previous_offset; - if(aligned!=previous_aligned) - std::memmove(aligned, previous_aligned, size); - - *(reinterpret_cast(aligned) - 1) = original; + if (original == old_original) return ptr; + uint8_t offset = static_cast(alignment - (reinterpret_cast(original) & (alignment - 1))); + void* aligned = static_cast(static_cast(original) + offset); + if (offset != old_offset) { + const void* src = static_cast(static_cast(original) + old_offset); + std::size_t count = (std::min)(new_size, old_size); + std::memmove(aligned, src, count); + } + *(static_cast(aligned) - 1) = offset; return aligned; } @@ -212,12 +216,12 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) * \brief Reallocates an aligned block of memory. * \throws std::bad_alloc on allocation failure */ -inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) +EIGEN_DEVICE_FUNC inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) { - EIGEN_UNUSED_VARIABLE(old_size) - + if (ptr == 0) return aligned_malloc(new_size); void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + EIGEN_UNUSED_VARIABLE(old_size) result = std::realloc(ptr,new_size); #else result = handmade_aligned_realloc(ptr,new_size,old_size); @@ -226,6 +230,11 @@ inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_si if (!result && new_size) throw_std_bad_alloc(); +#ifdef EIGEN_RUNTIME_NO_MALLOC + if (result != ptr) + check_that_malloc_is_allowed(); +#endif + return result; } @@ -265,12 +274,12 @@ template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *p free(ptr); } -template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) { return aligned_realloc(ptr, new_size, old_size); } -template<> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t) { return std::realloc(ptr, new_size); } @@ -292,20 +301,55 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. 
*/ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size) { - std::size_t i; + std::size_t i=0; EIGEN_TRY { for (i = 0; i < size; ++i) ::new (ptr + i) T; - return ptr; } EIGEN_CATCH(...) { destruct_elements_of_array(ptr, i); EIGEN_THROW; } - return NULL; + return ptr; +} + +/** \internal Copy-constructs the elements of an array. + * The \a size parameter tells on how many objects to copy. + */ +template EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + +/** \internal Move-constructs the elements of an array. + * The \a size parameter tells on how many objects to move. + */ +template EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i))); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; } /***************************************************************************** @@ -326,10 +370,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + T *result = static_cast(aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -342,10 +386,10 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -377,21 +421,32 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned { check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(new_size < old_size) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(new_size > old_size) + + // If elements need to be explicitly initialized, we cannot simply realloc + // (or memcpy) the memory block - each element needs to be reconstructed. + // Otherwise, objects that contain internal pointers like mpfr or + // AnnoyingScalar can be pointing to the wrong thing. + T* result = static_cast(conditional_aligned_malloc(sizeof(T)*new_size)); + EIGEN_TRY { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; + // Move-construct initial elements. + std::size_t copy_size = (std::min)(old_size, new_size); + move_construct_elements_of_array(result, pts, copy_size); + + // Default-construct remaining elements. 
+ if (new_size > old_size) { + default_construct_elements_of_array(result + copy_size, new_size - old_size); } + + // Delete old elements. + conditional_aligned_delete(pts, old_size); } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + return result; } @@ -401,12 +456,12 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned if(size==0) return 0; // short-cut. Also fixes Bug 884 check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); if(NumTraits::RequireInitialization) { EIGEN_TRY { - construct_elements_of_array(result, size); + default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -417,26 +472,15 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned return result; } -template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { + if (NumTraits::RequireInitialization) { + return conditional_aligned_realloc_new(pts, new_size, old_size); + } + check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(NumTraits::RequireInitialization && (new_size < old_size)) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(NumTraits::RequireInitialization && (new_size > old_size)) - { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; - } - } - return result; + return static_cast(conditional_aligned_realloc(static_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); } template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) @@ -566,17 +610,10 @@ template struct smart_memmove_helper { } }; -#if EIGEN_HAS_RVALUE_REFERENCES template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) { return std::move(start, end, target); } -#else -template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) -{ - return std::copy(start, end, target); -} -#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -617,7 +654,7 @@ template class aligned_stack_memory_handler : noncopyable : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) - Eigen::internal::construct_elements_of_array(m_ptr, size); + Eigen::internal::default_construct_elements_of_array(m_ptr, size); } EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() @@ -640,7 +677,7 @@ template struct local_nested_eval_wrapper { - static const bool NeedExternalBuffer = false; + static constexpr bool NeedExternalBuffer = false; typedef typename Xpr::Scalar Scalar; typedef typename nested_eval::type ObjectType; ObjectType object; @@ -656,7 +693,7 @@ struct local_nested_eval_wrapper template struct local_nested_eval_wrapper { - static const bool NeedExternalBuffer = true; + static constexpr bool NeedExternalBuffer = true; typedef typename Xpr::Scalar Scalar; typedef typename plain_object_eval::type PlainObject; typedef Map ObjectType; @@ -668,7 +705,7 @@ struct local_nested_eval_wrapper m_deallocate(ptr==0) { if(NumTraits::RequireInitialization && 
object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); + Eigen::internal::default_construct_elements_of_array(object.data(), object.size()); object = xpr; } @@ -853,7 +890,7 @@ template void swap(scoped_array &a,scoped_array &b) /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with types requiring a non standrad alignment. +* \brief STL compatible allocator to use with types requiring a non-standard alignment. * * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. * By default, it will thus provide at least 16 bytes alignment and more in following cases: @@ -941,7 +978,7 @@ public: __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) ); # endif # elif EIGEN_COMP_MSVC -# if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64 +# if EIGEN_ARCH_i386_OR_x86_64 # define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id) # endif # endif @@ -1156,6 +1193,38 @@ inline int queryTopLevelCacheSize() return (std::max)(l2,l3); } + + +/** \internal + * This wraps C++20's std::construct_at, using placement new instead if it is not available. + */ + +#if EIGEN_COMP_CXXVER >= 20 +using std::construct_at; +#else +template +EIGEN_DEVICE_FUNC T* construct_at( T* p, Args&&... args ) +{ + return ::new (const_cast(static_cast(p))) + T(std::forward(args)...); +} +#endif + +/** \internal + * This wraps C++17's std::destroy_at. If it's not available it calls the destructor. + * The wrapper is not a full replacement for C++20's std::destroy_at as it cannot + * be applied to std::array. + */ +#if EIGEN_COMP_CXXVER >= 17 +using std::destroy_at; +#else +template +EIGEN_DEVICE_FUNC void destroy_at(T* p) +{ + p->~T(); +} +#endif + } // end namespace internal } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/Core/util/Meta.h b/libs/eigen/Eigen/src/Core/util/Meta.h old mode 100755 new mode 100644 index 81ae2a3..6c6fb71 --- a/libs/eigen/Eigen/src/Core/util/Meta.h +++ b/libs/eigen/Eigen/src/Core/util/Meta.h @@ -11,6 +11,8 @@ #ifndef EIGEN_META_H #define EIGEN_META_H +#include "../InternalHeaderCheck.h" + #if defined(EIGEN_GPU_COMPILE_PHASE) #include @@ -26,11 +28,11 @@ #endif // Recent versions of ICC require for pointer types below. -#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) +#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600) // Define portable (u)int{32,64} types -#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT #include + namespace Eigen { namespace numext { typedef std::uint8_t uint8_t; @@ -41,25 +43,34 @@ typedef std::uint32_t uint32_t; typedef std::int32_t int32_t; typedef std::uint64_t uint64_t; typedef std::int64_t int64_t; + +template +struct get_integer_by_size { + typedef void signed_type; + typedef void unsigned_type; +}; +template <> +struct get_integer_by_size<1> { + typedef int8_t signed_type; + typedef uint8_t unsigned_type; +}; +template <> +struct get_integer_by_size<2> { + typedef int16_t signed_type; + typedef uint16_t unsigned_type; +}; +template <> +struct get_integer_by_size<4> { + typedef int32_t signed_type; + typedef uint32_t unsigned_type; +}; +template <> +struct get_integer_by_size<8> { + typedef int64_t signed_type; + typedef uint64_t unsigned_type; +}; } } -#else -// Without c++11, all compilers able to compile Eigen also -// provide the C99 stdint.h header file. 
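Returning to Memory.h for a moment: the `construct_at`/`destroy_at` wrappers added there can be driven on raw storage as below. This is a sketch assuming `<Eigen/Core>` pulls the wrappers in; pre-C++20 they fall back to placement new and an explicit destructor call, exactly as their comments state.

```cpp
#include <Eigen/Core>
#include <string>

void construct_at_demo() {
  // Raw, suitably aligned storage for one std::string.
  alignas(std::string) unsigned char buf[sizeof(std::string)];
  std::string* s =
      Eigen::internal::construct_at(reinterpret_cast<std::string*>(buf), "hello");
  s->append(" world");             // the object is alive here
  Eigen::internal::destroy_at(s);  // ends its lifetime; runs ~string()
}
```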
-#include -namespace Eigen { -namespace numext { -typedef ::uint8_t uint8_t; -typedef ::int8_t int8_t; -typedef ::uint16_t uint16_t; -typedef ::int16_t int16_t; -typedef ::uint32_t uint32_t; -typedef ::int32_t int32_t; -typedef ::uint64_t uint64_t; -typedef ::int64_t int64_t; -} -} -#endif namespace Eigen { @@ -105,23 +116,11 @@ struct bool_constant : true_type {}; template<> struct bool_constant : false_type {}; -template -struct conditional { typedef Then type; }; - -template -struct conditional { typedef Else type; }; - -template struct remove_reference { typedef T type; }; -template struct remove_reference { typedef T type; }; - -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; - -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type[]; }; -template struct remove_const { typedef T type[Size]; }; +// Third-party libraries rely on these. +using std::conditional; +using std::remove_reference; +using std::remove_pointer; +using std::remove_const; template struct remove_all { typedef T type; }; template struct remove_all { typedef typename remove_all::type type; }; @@ -130,6 +129,9 @@ template struct remove_all { typedef typename remove_all< template struct remove_all { typedef typename remove_all::type type; }; template struct remove_all { typedef typename remove_all::type type; }; +template +using remove_all_t = typename remove_all::type; + template struct is_arithmetic { enum { value = false }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -149,64 +151,13 @@ template struct is_same { enum { value = 0 }; }; template struct is_same { enum { value = 1 }; }; template< class T > -struct is_void : is_same::type> {}; +struct is_void : is_same> {}; -#if EIGEN_HAS_CXX11 template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; using std::is_integral; -#else -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -#if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -#endif -#endif -#if EIGEN_HAS_CXX11 using std::make_unsigned; -#else -// TODO: Possibly improve this implementation of make_unsigned. -// It is currently used only by -// template struct random_default_impl. 
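`remove_all` (and the new `remove_all_t` shorthand) has no single `std::` equivalent, which is why it stays while `conditional`, `remove_reference`, `remove_pointer`, and `remove_const` are now taken from the standard library. It peels off any stack of const, reference, and pointer layers, so both of these hold (assuming `<Eigen/Core>` is included):

```cpp
#include <Eigen/Core>
#include <type_traits>

static_assert(std::is_same<Eigen::internal::remove_all_t<const float&>, float>::value, "");
static_assert(std::is_same<Eigen::internal::remove_all_t<const double* const>, double>::value, "");
```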
-template struct make_unsigned; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned short type; }; -template<> struct make_unsigned { typedef unsigned short type; }; -template<> struct make_unsigned { typedef unsigned int type; }; -template<> struct make_unsigned { typedef unsigned int type; }; -template<> struct make_unsigned { typedef unsigned long type; }; -template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif - -// Some platforms define int64_t as `long long` even for C++03, where -// `long long` is not guaranteed by the standard. In this case we are missing -// the definition for make_unsigned. If we just define it, we run into issues -// where `long long` doesn't exist in some compilers for C++03. We therefore add -// the specialization for these platforms only. -#if EIGEN_OS_MAC || EIGEN_COMP_MINGW -template<> struct make_unsigned { typedef unsigned long long type; }; -template<> struct make_unsigned { typedef unsigned long long type; }; -#endif -#endif - -template struct add_const { typedef const T type; }; -template struct add_const { typedef T& type; }; template struct is_const { enum { value = 0 }; }; template struct is_const { enum { value = 1 }; }; @@ -217,205 +168,11 @@ template struct add_const_on_value_type { typedef T const template struct add_const_on_value_type { typedef T const* const type; }; template struct add_const_on_value_type { typedef T const* const type; }; -#if EIGEN_HAS_CXX11 +template +using add_const_on_value_type_t = typename add_const_on_value_type::type; using std::is_convertible; -#else - -template -struct is_convertible_impl -{ -private: - struct any_conversion - { - template any_conversion(const volatile T&); - template any_conversion(T&); - }; - struct yes {int a[1];}; - struct no {int a[2];}; - - template - static yes test(T, int); - - template - static no test(any_conversion, ...); - -public: - static typename internal::remove_reference::type* ms_from; -#ifdef __INTEL_COMPILER - #pragma warning push - #pragma warning ( disable : 2259 ) -#endif - enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; -#ifdef __INTEL_COMPILER - #pragma warning pop -#endif -}; - -template -struct is_convertible -{ - enum { value = is_convertible_impl::value }; -}; - -template -struct is_convertible { enum { value = false }; }; - -template -struct is_convertible { enum { value = true }; }; - -#endif - -/** \internal Allows to enable/disable an overload - * according to a compile time condition. 
- */ -template struct enable_if; - -template struct enable_if -{ typedef T type; }; - -#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 -#if !defined(__FLT_EPSILON__) -#define __FLT_EPSILON__ FLT_EPSILON -#define __DBL_EPSILON__ DBL_EPSILON -#endif - -namespace device { - -template struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static EIGEN_CONSTEXPR T epsilon() { return 0; } - static T (max)() { assert(false && "Highest not supported for this type"); } - static T (min)() { assert(false && "Lowest not supported for this type"); } - static T infinity() { assert(false && "Infinity not supported for this type"); } - static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static float epsilon() { return __FLT_EPSILON__; } - EIGEN_DEVICE_FUNC - static float (max)() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_MAX_NORMAL_F; - #else - return HIPRT_MAX_NORMAL_F; - #endif - } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static float (min)() { return FLT_MIN; } - EIGEN_DEVICE_FUNC - static float infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF_F; - #else - return HIPRT_INF_F; - #endif - } - EIGEN_DEVICE_FUNC - static float quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN_F; - #else - return HIPRT_NAN_F; - #endif - } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static double epsilon() { return __DBL_EPSILON__; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static double (max)() { return DBL_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static double (min)() { return DBL_MIN; } - EIGEN_DEVICE_FUNC - static double infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF; - #else - return HIPRT_INF; - #endif - } - EIGEN_DEVICE_FUNC - static double quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN; - #else - return HIPRT_NAN; - #endif - } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static int epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static int (max)() { return INT_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static int (min)() { return INT_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned int epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned int (max)() { return UINT_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned int (min)() { return 0; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long (max)() { return LONG_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long (min)() { return LONG_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long (max)() { return ULONG_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long (min)() { return 0; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long long (max)() { return LLONG_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static long long (min)() { return LLONG_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long long 
(max)() { return ULLONG_MAX; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static unsigned long long (min)() { return 0; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static bool epsilon() { return false; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static bool (max)() { return true; } - EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR - static bool (min)() { return false; } -}; - -} - -#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 - /** \internal * A base class to disable default copy ctor and copy assignment operator. */ @@ -446,7 +203,7 @@ template struct array_size { enum { value = Dynamic }; }; -template struct array_size::type> { +template struct array_size> { enum { value = T::SizeAtCompileTime }; }; @@ -457,44 +214,50 @@ template struct array_size { enum { value = N }; }; -#if EIGEN_HAS_CXX11 template struct array_size > { enum { value = N }; }; template struct array_size > { enum { value = N }; }; -#endif + /** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. */ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_AT_MOST(9,4) +template +EIGEN_CONSTEXPR auto index_list_size(const T& x) { + using R = std::common_type_t>; + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or * ternary functor. - * - * Pre C++11: - * Supports both a Func::result_type member and templated - * Func::result::type member. - * - * If none of these members is provided, then the type of the first - * argument is returned. - * - * Post C++11: + * + * Pre C++17: * This uses std::result_of. However, note the `type` member removes * const and converts references/pointers to their corresponding value type.
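The `index_list_size` helper introduced just above differs from the old internal `size()` in one important way: it always returns a signed quantity, mirroring `std::ssize`. A usage sketch, assuming `<Eigen/Core>`:

```cpp
#include <Eigen/Core>
#include <vector>

void index_list_size_demo() {
  std::vector<float> v(7);
  int carray[3] = {1, 2, 3};
  auto n1 = Eigen::internal::index_list_size(v);       // 7, as a signed type
  auto n2 = Eigen::internal::index_list_size(carray);  // std::ptrdiff_t(3)
  (void)n1; (void)n2;
}
```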
+ * + * Post C++17: Uses std::invoke_result */ #if EIGEN_HAS_STD_INVOKE_RESULT template struct result_of; @@ -502,152 +265,37 @@ template struct result_of; template struct result_of { typedef typename std::invoke_result::type type1; - typedef typename remove_all::type type; -}; -#elif EIGEN_HAS_STD_RESULT_OF -template struct result_of { - typedef typename std::result_of::type type1; - typedef typename remove_all::type type; -}; -#else -template struct result_of { }; - -struct has_none {int a[1];}; -struct has_std_result_type {int a[2];}; -struct has_tr1_result {int a[3];}; - -template -struct nullary_result_of_select {}; - -template -struct nullary_result_of_select {typedef typename Func::result_type type;}; - -template -struct nullary_result_of_select {typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename nullary_result_of_select::type type; + typedef remove_all_t type; }; -template -struct unary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct unary_result_of_select {typedef typename Func::result_type type;}; - -template -struct unary_result_of_select {typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename unary_result_of_select::type type; -}; - -template -struct binary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct binary_result_of_select -{typedef typename Func::result_type type;}; - -template -struct binary_result_of_select -{typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename binary_result_of_select::type type; -}; - -template -struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct ternary_result_of_select -{typedef typename Func::result_type type;}; - -template -struct ternary_result_of_select -{typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename ternary_result_of_select::type type; -}; - -#endif - -#if 
EIGEN_HAS_STD_INVOKE_RESULT template struct invoke_result { typedef typename std::invoke_result::type type1; - typedef typename remove_all::type type; -}; -#elif EIGEN_HAS_CXX11 -template -struct invoke_result { - typedef typename result_of::type type1; - typedef typename remove_all::type type; + typedef remove_all_t type; }; #else -template +template struct result_of { + typedef typename std::result_of::type type1; + typedef remove_all_t type; +}; + +template struct invoke_result { - typedef typename result_of::type type1; - typedef typename remove_all::type type; -}; - -template -struct invoke_result { - typedef typename result_of::type type1; - typedef typename remove_all::type type; -}; - -template -struct invoke_result { - typedef typename result_of::type type1; - typedef typename remove_all::type type; -}; - -template -struct invoke_result { - typedef typename result_of::type type1; - typedef typename remove_all::type type; + typedef typename result_of::type type1; + typedef remove_all_t type; }; #endif +// Reduces a sequence of bools to true if all are true, false otherwise. +template +using reduce_all = std::is_same, + std::integer_sequence >; + +// Reduces a sequence of bools to true if any are true, false if all false. +template +using reduce_any = std::integral_constant, std::integer_sequence >::value>; + struct meta_yes { char a[1]; }; struct meta_no { char a[2]; }; @@ -666,7 +314,7 @@ template const T* return_ptr(); template struct has_nullary_operator { - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()())>0)>::type * = 0); + template static meta_yes testFunctor(C const *,std::enable_if_t<(sizeof(return_ptr()->operator()())>0)> * = 0); static meta_no testFunctor(...); enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; @@ -675,7 +323,7 @@ struct has_nullary_operator template struct has_unary_operator { - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0)))>0)>::type * = 0); + template static meta_yes testFunctor(C const *,std::enable_if_t<(sizeof(return_ptr()->operator()(IndexType(0)))>0)> * = 0); static meta_no testFunctor(...); enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; @@ -684,7 +332,7 @@ struct has_unary_operator template struct has_binary_operator { - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0); + template static meta_yes testFunctor(C const *,std::enable_if_t<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)> * = 0); static meta_no testFunctor(...); enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; @@ -696,8 +344,7 @@ struct has_binary_operator template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning + bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))> class meta_sqrt { enum { @@ -743,7 +390,7 @@ template struct scalar_product_traits // FIXME quick workaround around current limitation of result_of // template // struct result_of(ArgType0,ArgType1)> { -// typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; +// typedef typename scalar_product_traits, remove_all_t>::ReturnType type; // }; /** \internal Obtains a POD type suitable to use as storage for an object of a size @@ -758,6 +405,8 @@ struct aligned_storage { } // end namespace internal +template struct NumTraits; + 
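The new `reduce_all`/`reduce_any` aliases added above fold a parameter pack of booleans entirely at compile time by comparing two `std::integer_sequence`s shifted by one element. Expected behavior, assuming `<Eigen/Core>`:

```cpp
#include <Eigen/Core>

static_assert(Eigen::internal::reduce_all<true, true, true>::value, "");
static_assert(!Eigen::internal::reduce_all<true, false>::value, "");
static_assert(Eigen::internal::reduce_any<false, true>::value, "");
static_assert(!Eigen::internal::reduce_any<false, false>::value, "");
```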
namespace numext { #if defined(EIGEN_GPU_COMPILE_PHASE) @@ -766,11 +415,7 @@ template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 -using internal::device::numeric_limits; -#else using std::numeric_limits; -#endif // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 @@ -794,6 +439,20 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } #endif +/** + * \internal Performs an exact comparison of x to zero, e.g. to decide whether a term can be ignored. + * Use this to bypass -Wfloat-equal warnings when exact zero is what needs to be tested. +*/ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool is_exactly_zero(const X& x) { return equal_strict(x, typename NumTraits::Literal{0}); } + +/** + * \internal Performs an exact comparison of x to one, e.g. to decide whether a factor needs to be multiplied. + * Use this to bypass -Wfloat-equal warnings when exact one is what needs to be tested. +*/ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool is_exactly_one(const X& x) { return equal_strict(x, typename NumTraits::Literal{1}); } + template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } @@ -807,6 +466,101 @@ bool not_equal_strict(const double& x,const double& y) { return std::not_equal_t } // end namespace numext +namespace internal { + +template +struct is_identically_zero_impl { + static inline bool run(const Scalar& s) { + return numext::is_exactly_zero(s); + } +}; + +template EIGEN_STRONG_INLINE +bool is_identically_zero(const Scalar& s) { return is_identically_zero_impl::run(s); } + +/// \internal Returns true if its argument is of integer or enum type. +/// FIXME this has the same purpose as `is_valid_index_type` in XprHelper.h +template +constexpr bool is_int_or_enum_v = std::is_enum::value || std::is_integral::value; + +/// \internal Gets the minimum of two values which may be integers or enums +template +inline constexpr int plain_enum_min(A a, B b) { + static_assert(is_int_or_enum_v, "Argument a must be an integer or enum"); + static_assert(is_int_or_enum_v, "Argument b must be an integer or enum"); + return ((int) a <= (int) b) ? (int) a : (int) b; +} + +/// \internal Gets the maximum of two values which may be integers or enums +template +inline constexpr int plain_enum_max(A a, B b) { + static_assert(is_int_or_enum_v, "Argument a must be an integer or enum"); + static_assert(is_int_or_enum_v, "Argument b must be an integer or enum"); + return ((int) a >= (int) b) ? (int) a : (int) b; +} + +/** + * \internal + * `min_size_prefer_dynamic` gives the min between compile-time sizes. 0 has absolute priority, followed by 1, + * followed by Dynamic, followed by other finite values. The reason for giving Dynamic the priority over + * finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
+ */ +template +inline constexpr int min_size_prefer_dynamic(A a, B b) { + static_assert(is_int_or_enum_v, "Argument a must be an integer or enum"); + static_assert(is_int_or_enum_v, "Argument b must be an integer or enum"); + if ((int) a == 0 || (int) b == 0) return 0; + if ((int) a == 1 || (int) b == 1) return 1; + if ((int) a == Dynamic || (int) b == Dynamic) return Dynamic; + return plain_enum_min(a, b); +} + +/** + * \internal + * min_size_prefer_fixed is a variant of `min_size_prefer_dynamic` comparing MaxSizes. The difference is that finite values + * now have priority over Dynamic, so that min(3, Dynamic) gives 3. Indeed, whatever the actual value is + * (between 0 and 3), it is not more than 3. + */ +template +inline constexpr int min_size_prefer_fixed(A a, B b) { + static_assert(is_int_or_enum_v, "Argument a must be an integer or enum"); + static_assert(is_int_or_enum_v, "Argument b must be an integer or enum"); + if ((int) a == 0 || (int) b == 0) return 0; + if ((int) a == 1 || (int) b == 1) return 1; + if ((int) a == Dynamic && (int) b == Dynamic) return Dynamic; + if ((int) a == Dynamic) return (int) b; + if ((int) b == Dynamic) return (int) a; + return plain_enum_min(a, b); +} + +/// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here. +template +inline constexpr int max_size_prefer_dynamic(A a, B b) { + static_assert(is_int_or_enum_v, "Argument a must be an integer or enum"); + static_assert(is_int_or_enum_v, "Argument b must be an integer or enum"); + if ((int) a == Dynamic || (int) b == Dynamic) return Dynamic; + return plain_enum_max(a, b); +} + +/// \internal Calculate logical XOR at compile time +inline constexpr bool logical_xor(bool a, bool b) { + return a != b; +} + +/// \internal Calculate logical IMPLIES at compile time +inline constexpr bool check_implication(bool a, bool b) { + return !a || b; +} + +/// \internal Provide fallback for std::is_constant_evaluated for pre-C++20. +#if EIGEN_COMP_CXXVER >= 20 +using std::is_constant_evaluated; +#else +constexpr bool is_constant_evaluated() { return false; } +#endif + +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_META_H diff --git a/libs/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h b/libs/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h index 1ce6fd1..7021e6d 100644 --- a/libs/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/libs/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -1,5 +1,5 @@ #ifdef EIGEN_WARNINGS_DISABLED_2 -// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet! +// "DisableStupidWarnings.h" was included twice recursively: Do not re-enable warnings yet! # undef EIGEN_WARNINGS_DISABLED_2 #elif defined(EIGEN_WARNINGS_DISABLED) @@ -12,18 +12,26 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) + #elif defined __GNUC__ && !defined(__FUJITSU) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif #if defined __NVCC__ -// Don't reenable the diagnostic messages, as it turns out these messages need +// Don't re-enable the diagnostic messages, as it turns out these messages need // to be disabled at the point of the template instantiation (i.e the user code) // otherwise they'll be triggered by nvcc. 
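The compile-time size helpers added to Meta.h above encode the priority rules spelled out in their comments (with `Dynamic` being Eigen's usual -1 sentinel), and being `constexpr` functions they can be checked directly, assuming `<Eigen/Core>`:

```cpp
#include <Eigen/Core>

static_assert(Eigen::internal::min_size_prefer_dynamic(3, Eigen::Dynamic) == Eigen::Dynamic, "");
static_assert(Eigen::internal::min_size_prefer_dynamic(0, Eigen::Dynamic) == 0, "");
static_assert(Eigen::internal::min_size_prefer_fixed(3, Eigen::Dynamic) == 3, "");
static_assert(Eigen::internal::max_size_prefer_dynamic(2, Eigen::Dynamic) == Eigen::Dynamic, "");
static_assert(Eigen::internal::check_implication(false, true), "");  // !a || b
```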
-// #pragma diag_default code_is_unreachable -// #pragma diag_default initialization_not_reachable -// #pragma diag_default 2651 -// #pragma diag_default 2653 +// #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) +// #if __NVCC_DIAG_PRAGMA_SUPPORT__ +// #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(nv_diag_default X) +// #else +// #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(diag_default X) +// #endif +// EIGEN_NV_DIAG_DEFAULT(code_is_unreachable) +// EIGEN_NV_DIAG_DEFAULT(initialization_not_reachable) +// EIGEN_NV_DIAG_DEFAULT(2651) +// EIGEN_NV_DIAG_DEFAULT(2653) +// #undef EIGEN_NV_DIAG_DEFAULT +// #undef EIGEN_MAKE_PRAGMA #endif #endif diff --git a/libs/eigen/Eigen/src/Core/util/ReshapedHelper.h b/libs/eigen/Eigen/src/Core/util/ReshapedHelper.h index 4124321..6daea03 100644 --- a/libs/eigen/Eigen/src/Core/util/ReshapedHelper.h +++ b/libs/eigen/Eigen/src/Core/util/ReshapedHelper.h @@ -11,6 +11,8 @@ #ifndef EIGEN_RESHAPED_HELPER_H #define EIGEN_RESHAPED_HELPER_H +#include "../InternalHeaderCheck.h" + namespace Eigen { enum AutoSize_t { AutoSize }; @@ -39,10 +41,9 @@ inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index to return total/other; } -template -struct get_compiletime_reshape_order { - enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order }; -}; +constexpr inline int get_compiletime_reshape_order(int flags, int order) { + return order == AutoOrder ? flags & RowMajorBit : order; +} } diff --git a/libs/eigen/Eigen/src/Core/util/Serializer.h b/libs/eigen/Eigen/src/Core/util/Serializer.h new file mode 100644 index 0000000..cbfc04a --- /dev/null +++ b/libs/eigen/Eigen/src/Core/util/Serializer.h @@ -0,0 +1,220 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 The Eigen Team +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SERIALIZER_H +#define EIGEN_SERIALIZER_H + +#include + +// The Serializer class encodes data into a memory buffer so it can be later +// reconstructed. This is mainly used to send objects back-and-forth between +// the CPU and GPU. + +namespace Eigen { + +/** + * Serializes an object to a memory buffer. + * + * Useful for transferring data (e.g. back-and-forth to a device). + */ +template +class Serializer; + +// Specialization for POD types. +template +class Serializer::value + && std::is_standard_layout::value>> { + public: + + /** + * Determines the required size of the serialization buffer for a value. + * + * \param value the value to serialize. + * \return the required size. + */ + EIGEN_DEVICE_FUNC size_t size(const T& value) const { + return sizeof(value); + } + + /** + * Serializes a value to a byte buffer. + * \param dest the destination buffer; if this is nullptr, does nothing. + * \param end the end of the destination buffer. + * \param value the value to serialize. + * \return the next memory address past the end of the serialized data. + */ + EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const T& value) { + if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(dest + sizeof(value) > end)) return nullptr; + EIGEN_USING_STD(memcpy) + memcpy(dest, &value, sizeof(value)); + return dest + sizeof(value); + } + + /** + * Deserializes a value from a byte buffer. + * \param src the source buffer; if this is nullptr, does nothing. 
+ * \param end the end of the source buffer. + * \param value the value to populate. + * \return the next unprocessed memory address; nullptr if parsing errors are detected. + */ + EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, T& value) const { + if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(src + sizeof(value) > end)) return nullptr; + EIGEN_USING_STD(memcpy) + memcpy(&value, src, sizeof(value)); + return src + sizeof(value); + } +}; + +// Specialization for DenseBase. +// Serializes [rows, cols, data...]. +template +class Serializer, void> { + public: + typedef typename Derived::Scalar Scalar; + + struct Header { + typename Derived::Index rows; + typename Derived::Index cols; + }; + + EIGEN_DEVICE_FUNC size_t size(const Derived& value) const { + return sizeof(Header) + sizeof(Scalar) * value.size(); + } + + EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const Derived& value) { + if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr; + const size_t header_bytes = sizeof(Header); + const size_t data_bytes = sizeof(Scalar) * value.size(); + Header header = {value.rows(), value.cols()}; + EIGEN_USING_STD(memcpy) + memcpy(dest, &header, header_bytes); + dest += header_bytes; + memcpy(dest, value.data(), data_bytes); + return dest + data_bytes; + } + + EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, Derived& value) const { + if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr; + const size_t header_bytes = sizeof(Header); + Header header; + EIGEN_USING_STD(memcpy) + memcpy(&header, src, header_bytes); + src += header_bytes; + const size_t data_bytes = sizeof(Scalar) * header.rows * header.cols; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + value.resize(header.rows, header.cols); + memcpy(value.data(), src, data_bytes); + return src + data_bytes; + } +}; + +template +class Serializer > : public + Serializer > > {}; + +template +class Serializer > : public + Serializer > > {}; + +namespace internal { + +// Recursive serialization implementation helper. +template +struct serialize_impl; + +template +struct serialize_impl { + using Serializer = Eigen::Serializer::type>; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t serialize_size(const T1& value, const Ts&... args) { + Serializer serializer; + size_t size = serializer.size(value); + return size + serialize_impl::serialize_size(args...); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + uint8_t* serialize(uint8_t* dest, uint8_t* end, const T1& value, const Ts&... args) { + Serializer serializer; + dest = serializer.serialize(dest, end, value); + return serialize_impl::serialize(dest, end, args...); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, T1& value, Ts&... args) { + Serializer serializer; + src = serializer.deserialize(src, end, value); + return serialize_impl::deserialize(src, end, args...); + } +}; + +// Base case. 
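Before the recursion bottoms out in the base case below, here is how the new serialization API composes end to end: a hypothetical round trip through a scalar and a matrix, assuming Serializer.h is reachable via `<Eigen/Core>` (buffer handling and names are illustrative only):

```cpp
#include <Eigen/Core>
#include <cstdint>
#include <vector>

void serializer_round_trip() {
  float x = 3.5f;
  Eigen::MatrixXf m = Eigen::MatrixXf::Random(4, 3);

  // 1. Measure, 2. pack, 3. unpack.
  std::vector<std::uint8_t> buffer(Eigen::serialize_size(x, m));
  std::uint8_t* end = buffer.data() + buffer.size();
  Eigen::serialize(buffer.data(), end, x, m);

  float x2 = 0.f;
  Eigen::MatrixXf m2;
  const std::uint8_t* next = Eigen::deserialize(buffer.data(), end, x2, m2);
  // next is nullptr if the buffer was truncated or otherwise malformed.
  (void)next;
}
```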
+template<> +struct serialize_impl<0> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t serialize_size() { return 0; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + uint8_t* serialize(uint8_t* dest, uint8_t* /*end*/) { return dest; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const uint8_t* deserialize(const uint8_t* src, const uint8_t* /*end*/) { return src; } +}; + +} // namespace internal + + +/** + * Determine the buffer size required to serialize a set of values. + * + * \param args ... arguments to serialize in sequence. + * \return the total size of the required buffer. + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +size_t serialize_size(const Args&... args) { + return internal::serialize_impl::serialize_size(args...); +} + +/** + * Serialize a set of values to the byte buffer. + * + * \param dest output byte buffer; if this is nullptr, does nothing. + * \param end the end of the output byte buffer. + * \param args ... arguments to serialize in sequence. + * \return the next address after all serialized values. + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +uint8_t* serialize(uint8_t* dest, uint8_t* end, const Args&... args) { + return internal::serialize_impl::serialize(dest, end, args...); +} + +/** + * Deserialize a set of values from the byte buffer. + * + * \param src input byte buffer; if this is nullptr, does nothing. + * \param end the end of input byte buffer. + * \param args ... arguments to deserialize in sequence. + * \return the next address after all parsed values; nullptr if parsing errors are detected. + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, Args&... args) { + return internal::serialize_impl::deserialize(src, end, args...); +} + +} // namespace Eigen + +#endif // EIGEN_SERIALIZER_H diff --git a/libs/eigen/Eigen/src/Core/util/StaticAssert.h b/libs/eigen/Eigen/src/Core/util/StaticAssert.h index c45de59..c938eb8 100644 --- a/libs/eigen/Eigen/src/Core/util/StaticAssert.h +++ b/libs/eigen/Eigen/src/Core/util/StaticAssert.h @@ -16,10 +16,6 @@ * - in EIGEN_STATIC_ASSERT(CONDITION,MSG) the parameter CONDITION must be a compile time boolean * expression, and MSG an enum listed in struct internal::static_assertion * - * - define EIGEN_NO_STATIC_ASSERT to disable them (and save compilation time) - * in that case, the static assertion is converted to the following runtime assert: - * eigen_assert(CONDITION && "MSG") - * * - currently EIGEN_STATIC_ASSERT can only be used in function scope * */ @@ -27,113 +23,11 @@ #ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT - #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) - - // if native static_assert is enabled, let's use it - #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); - - #else // not CXX0X - - namespace Eigen { - - namespace internal { - - template - struct static_assertion {}; - - template<> - struct static_assertion - { - enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, - OUT_OF_RANGE_ACCESS=1, - YOU_MADE_A_PROGRAMMING_MISTAKE=1, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, - 
YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, - UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, - NUMERIC_TYPE_MUST_BE_REAL=1, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, - WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, - INVALID_MATRIX_PRODUCT=1, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, - INVALID_MATRIX_TEMPLATE_PARAMETERS=1, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, - YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, - THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, - STORAGE_LAYOUT_DOES_NOT_MATCH=1, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, - THIS_TYPE_IS_NOT_SUPPORTED=1, - STORAGE_KIND_MUST_MATCH=1, - STORAGE_INDEX_MUST_MATCH=1, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, - SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, - INVALID_TEMPLATE_PARAMETER=1, - GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1, - THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1 - }; - }; - - } // end namespace internal - - } // end namespace Eigen - - // Specialized implementation for MSVC to avoid "conditional - // expression is constant" warnings. This implementation doesn't - // appear to work under GCC, hence the multiple implementations. 
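The enum machinery and compiler-specific expansions being deleted here existed only so that pre-C++11 compilers would surface a readable identifier in the error output; with C++14 now the baseline, the replacement just below is a one-line `static_assert` that stringifies the message token. A trivial sketch of post-change usage:

```cpp
#include <Eigen/Core>

// EIGEN_STATIC_ASSERT(X, MSG) now expands to: static_assert(X, "MSG");
void static_assert_demo() {
  EIGEN_STATIC_ASSERT(sizeof(float) == 4, THIS_TYPE_IS_NOT_SUPPORTED)
}
```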
- #if EIGEN_COMP_MSVC - - #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \ - {Eigen::internal::static_assertion::MSG;} - - #else - // In some cases clang interprets bool(CONDITION) as function declaration - #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \ - if (Eigen::internal::static_assertion(CONDITION)>::MSG) {} - - #endif - - #endif // not CXX0X +#define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); #else // EIGEN_NO_STATIC_ASSERT - #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); +#define EIGEN_STATIC_ASSERT(CONDITION,MSG) #endif // EIGEN_NO_STATIC_ASSERT #endif // EIGEN_STATIC_ASSERT diff --git a/libs/eigen/Eigen/src/Core/util/SymbolicIndex.h b/libs/eigen/Eigen/src/Core/util/SymbolicIndex.h index 354dd9a..3b19185 100644 --- a/libs/eigen/Eigen/src/Core/util/SymbolicIndex.h +++ b/libs/eigen/Eigen/src/Core/util/SymbolicIndex.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SYMBOLIC_INDEX_H #define EIGEN_SYMBOLIC_INDEX_H +#include "../InternalHeaderCheck.h" + namespace Eigen { /** \namespace Eigen::symbolic @@ -30,12 +32,11 @@ namespace Eigen { * // And evaluate it: (c++14) * std::cout << expr.eval(x=6,y=3,z=-13) << "\n"; * - * // In c++98/11, only one symbol per expression is supported for now: - * auto expr98 = (3-x)/2; - * std::cout << expr98.eval(x=6) << "\n"; * \endcode * - * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. + * It is currently only used internally to define and manipulate the + * Eigen::placeholders::last and Eigen::placeholders::lastp1 symbols in + * Eigen::seq and Eigen::seqN. * */ namespace symbolic { @@ -88,10 +89,8 @@ public: template Index eval(const T& values) const { return derived().eval_impl(values); } -#if EIGEN_HAS_CXX14 template Index eval(Types&&... 
values) const { return derived().eval_impl(std::make_tuple(values...)); } -#endif NegateExpr operator-() const { return NegateExpr(derived()); } @@ -139,34 +138,6 @@ public: friend QuotientExpr >,Derived> operator/(internal::FixedInt, const BaseExpr& b) { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } -#if (!EIGEN_HAS_CXX14) - template - AddExpr > > operator+(internal::FixedInt (*)()) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - AddExpr > > operator-(internal::FixedInt (*)()) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - ProductExpr > > operator*(internal::FixedInt (*)()) const - { return ProductExpr > >(derived(),ValueExpr >()); } - template - QuotientExpr > > operator/(internal::FixedInt (*)()) const - { return QuotientExpr > >(derived(),ValueExpr >()); } - - template - friend AddExpr > > operator+(internal::FixedInt (*)(), const BaseExpr& b) - { return AddExpr > >(b.derived(), ValueExpr >()); } - template - friend AddExpr,ValueExpr > > operator-(internal::FixedInt (*)(), const BaseExpr& b) - { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } - template - friend ProductExpr >,Derived> operator*(internal::FixedInt (*)(), const BaseExpr& b) - { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } - template - friend QuotientExpr >,Derived> operator/(internal::FixedInt (*)(), const BaseExpr& b) - { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } -#endif - template AddExpr operator+(const BaseExpr &b) const @@ -228,11 +199,9 @@ public: Index eval_impl(const SymbolValue &values) const { return values.value(); } -#if EIGEN_HAS_CXX14 // C++14 versions suitable for multiple symbols template Index eval_impl(const std::tuple& values) const { return std::get >(values).value(); } -#endif }; template diff --git a/libs/eigen/Eigen/src/Core/util/XprHelper.h b/libs/eigen/Eigen/src/Core/util/XprHelper.h index 71c32b8..b5f91bf 100644 --- a/libs/eigen/Eigen/src/Core/util/XprHelper.h +++ b/libs/eigen/Eigen/src/Core/util/XprHelper.h @@ -11,16 +11,7 @@ #ifndef EIGEN_XPRHELPER_H #define EIGEN_XPRHELPER_H -// just a workaround because GCC seems to not really like empty structs -// FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled -// so currently we simply disable this optimization for gcc 4.3 -#if EIGEN_COMP_GNUC && !EIGEN_GNUC_AT(4,3) - #define EIGEN_EMPTY_STRUCT_CTOR(X) \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X() {} \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X(const X& ) {} -#else - #define EIGEN_EMPTY_STRUCT_CTOR(X) -#endif +#include "../InternalHeaderCheck.h" namespace Eigen { @@ -37,15 +28,7 @@ inline IndexDest convert_index(const IndexSrc& idx) { // true if T can be considered as an integral index (i.e., an integral type or enum) template struct is_valid_index_type { - enum { value = -#if EIGEN_HAS_TYPE_TRAITS - internal::is_integral::value || std::is_enum::value -#elif EIGEN_COMP_MSVC - internal::is_integral::value || __is_enum(T) -#else - // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
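With C++14 guaranteed, `is_valid_index_type` (whose simplified one-line definition follows) reduces to "integral or enum", so all of these hold, assuming `<Eigen/Core>`:

```cpp
#include <Eigen/Core>

enum class Axis : int { X = 0 };
static_assert(Eigen::internal::is_valid_index_type<int>::value, "");
static_assert(Eigen::internal::is_valid_index_type<Axis>::value, "");
static_assert(!Eigen::internal::is_valid_index_type<float>::value, "");
```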
- internal::is_convertible::value && !internal::is_same::value && !is_same::value -#endif + enum { value = internal::is_integral::value || std::is_enum::value }; }; @@ -119,7 +102,7 @@ class no_assignment_operator template struct promote_index_type { - typedef typename conditional<(sizeof(I1)::type type; + typedef std::conditional_t<(sizeof(I1) type; }; /** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that @@ -154,7 +137,6 @@ template class variable_if_dynamic template class variable_if_dynamicindex { public: - EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR T value() { return T(Value); } @@ -209,83 +191,68 @@ struct find_best_packet }; #if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -template -struct compute_default_alignment_helper -{ - enum { value = 0 }; -}; - -template -struct compute_default_alignment_helper // Match -{ - enum { value = AlignmentBytes }; -}; - -template -struct compute_default_alignment_helper // Try-half -{ - // current packet too large, try with an half-packet - enum { value = compute_default_alignment_helper::value }; -}; +constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { + if((ArrayBytes % AlignmentBytes) == 0) { + return AlignmentBytes; + } else if (EIGEN_MIN_ALIGN_BYTES -struct compute_default_alignment_helper -{ - enum { value = 0 }; -}; +// This also avoids a division by zero +constexpr inline int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) { + EIGEN_UNUSED_VARIABLE(ArrayBytes); + EIGEN_UNUSED_VARIABLE(AlignmentBytes); + return 0; +} #endif template struct compute_default_alignment { - enum { value = compute_default_alignment_helper::value }; + enum { value = compute_default_alignment_helper(Size*sizeof(T), EIGEN_MAX_STATIC_ALIGN_BYTES) }; }; template struct compute_default_alignment { enum { value = EIGEN_MAX_ALIGN_BYTES }; }; -template class make_proper_matrix_type { enum { - IsColVector = _Cols==1 && _Rows!=1, - IsRowVector = _Rows==1 && _Cols!=1, - Options = IsColVector ? (_Options | ColMajor) & ~RowMajor - : IsRowVector ? (_Options | RowMajor) & ~ColMajor - : _Options + IsColVector = Cols_==1 && Rows_!=1, + IsRowVector = Rows_==1 && Cols_!=1, + Options = IsColVector ? (Options_ | ColMajor) & ~RowMajor + : IsRowVector ? (Options_ | RowMajor) & ~ColMajor + : Options_ }; public: - typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type; + typedef Matrix type; }; -template -class compute_matrix_flags -{ - enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 }; - public: - // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<> - // and then propagate this information to the evaluator's flags. - // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage. - enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit }; -}; +constexpr inline unsigned compute_matrix_flags(int Options) { + unsigned row_major_bit = Options&RowMajor ? RowMajorBit : 0; + // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<> + // and then propagate this information to the evaluator's flags. + // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage. 
+ return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit; +} -template struct size_at_compile_time -{ - enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols }; -}; +constexpr inline int size_at_compile_time(int rows, int cols) { + return (rows==Dynamic || cols==Dynamic) ? Dynamic : rows * cols; +} template struct size_of_xpr_at_compile_time { - enum { ret = size_at_compile_time::RowsAtCompileTime,traits::ColsAtCompileTime>::ret }; + enum { ret = size_at_compile_time(traits::RowsAtCompileTime, traits::ColsAtCompileTime) }; }; /* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type, @@ -303,6 +270,11 @@ template struct plain_matrix_type typedef typename T::PlainObject type; }; +template struct plain_matrix_type +{ + typedef typename T::PlainObject type; +}; + template struct plain_matrix_type_dense { typedef Matrix::Scalar, @@ -349,17 +321,22 @@ template struct eval typedef typename plain_matrix_type::type type; }; -// for matrices, no need to evaluate, just use a const reference to avoid a useless copy -template -struct eval, Dense> +template struct eval { - typedef const Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type; + typedef typename plain_matrix_type::type type; }; -template -struct eval, Dense> +// for matrices, no need to evaluate, just use a const reference to avoid a useless copy +template +struct eval, Dense> { - typedef const Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type; + typedef const Matrix& type; +}; + +template +struct eval, Dense> +{ + typedef const Array& type; }; @@ -415,28 +392,28 @@ template struct plain_matrix_type_row_major template struct ref_selector { - typedef typename conditional< + typedef std::conditional_t< bool(traits::Flags & NestByRefBit), T const&, const T - >::type type; + > type; - typedef typename conditional< + typedef std::conditional_t< bool(traits::Flags & NestByRefBit), T &, T - >::type non_const_type; + > non_const_type; }; /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */ template struct transfer_constness { - typedef typename conditional< + typedef std::conditional_t< bool(internal::is_const::value), - typename internal::add_const_on_value_type::type, + add_const_on_value_type_t, T2 - >::type type; + > type; }; @@ -469,7 +446,7 @@ template Evaluate = (int(evaluator::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval)) }; - typedef typename conditional::type>::type type; + typedef std::conditional_t::type> type; }; template @@ -509,10 +486,10 @@ struct generic_xpr_base template struct cast_return_type { typedef typename XprType::Scalar CurrentScalarType; - typedef typename remove_all::type _CastType; - typedef typename _CastType::Scalar NewScalarType; - typedef typename conditional::value, - const XprType&,CastType>::type type; + typedef remove_all_t CastType_; + typedef typename CastType_::Scalar NewScalarType; + typedef std::conditional_t::value, + const XprType&,CastType> type; }; template struct promote_storage_type; @@ -587,6 +564,12 @@ template struct product_promote_storage_type struct product_promote_storage_type { typedef Dense ret; }; template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef A ret; }; +template struct product_promote_storage_type { typedef B ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef 
Dense ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; + template struct product_promote_storage_type { typedef A ret; }; template struct product_promote_storage_type { typedef B ret; }; template struct product_promote_storage_type { typedef Dense ret; }; @@ -603,11 +586,11 @@ struct plain_row_type typedef Array ArrayRowType; - typedef typename conditional< + typedef std::conditional_t< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixRowType, ArrayRowType - >::type type; + > type; }; template @@ -618,27 +601,28 @@ struct plain_col_type typedef Array ArrayColType; - typedef typename conditional< + typedef std::conditional_t< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixColType, ArrayColType - >::type type; + > type; }; template struct plain_diag_type { - enum { diag_size = EIGEN_SIZE_MIN_PREFER_DYNAMIC(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime), - max_diag_size = EIGEN_SIZE_MIN_PREFER_FIXED(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime) + enum { diag_size = internal::min_size_prefer_dynamic(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime), + max_diag_size = min_size_prefer_fixed(ExpressionType::MaxRowsAtCompileTime, + ExpressionType::MaxColsAtCompileTime) }; typedef Matrix MatrixDiagType; typedef Array ArrayDiagType; - typedef typename conditional< + typedef std::conditional_t< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixDiagType, ArrayDiagType - >::type type; + > type; }; template @@ -652,7 +636,7 @@ struct plain_constant_type typedef Matrix::RowsAtCompileTime, traits::ColsAtCompileTime, Options, traits::MaxRowsAtCompileTime,traits::MaxColsAtCompileTime> matrix_type; - typedef CwiseNullaryOp, const typename conditional::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type; + typedef CwiseNullaryOp, const std::conditional_t::XprKind, MatrixXpr >::value, matrix_type, array_type> > type; }; template @@ -692,14 +676,14 @@ struct possibly_same_dense { template EIGEN_DEVICE_FUNC -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) +bool is_same_dense(const T1 &mat1, const T2 &mat2, std::enable_if_t::value> * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template EIGEN_DEVICE_FUNC -bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) +bool is_same_dense(const T1 &, const T2 &, std::enable_if_t::value> * = 0) { return false; } @@ -721,9 +705,9 @@ struct scalar_div_cost, Vectorized> { template -struct scalar_div_cost::type> { enum { value = 24 }; }; +struct scalar_div_cost> { enum { value = 24 }; }; template -struct scalar_div_cost::type> { enum { value = 21 }; }; +struct scalar_div_cost> { enum { value = 21 }; }; #ifdef EIGEN_DEBUG_ASSIGN @@ -812,12 +796,12 @@ struct ScalarBinaryOpTraits }; template -struct ScalarBinaryOpTraits::IsComplex,T>::type>::Real, BinaryOp> +struct ScalarBinaryOpTraits::IsComplex,T>>::Real, BinaryOp> { typedef T ReturnType; }; template -struct ScalarBinaryOpTraits::IsComplex,T>::type>::Real, T, BinaryOp> +struct ScalarBinaryOpTraits::IsComplex,T>>::Real, T, BinaryOp> { typedef T ReturnType; }; diff --git a/libs/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/libs/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h index 081e918..1cfc0ca 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ 
b/libs/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -14,6 +14,8 @@ #include "./ComplexSchur.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -23,7 +25,7 @@ namespace Eigen { * * \brief Computes eigenvalues and eigenvectors of general complex matrices * - * \tparam _MatrixType the type of the matrix of which we are + * \tparam MatrixType_ the type of the matrix of which we are * computing the eigendecomposition; this is expected to be an * instantiation of the Matrix class template. * @@ -42,12 +44,12 @@ namespace Eigen { * * \sa class EigenSolver, class SelfAdjointEigenSolver */ -template class ComplexEigenSolver +template class ComplexEigenSolver { public: - /** \brief Synonym for the template parameter \p _MatrixType. */ - typedef _MatrixType MatrixType; + /** \brief Synonym for the template parameter \p MatrixType_. */ + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -236,12 +238,9 @@ template class ComplexEigenSolver } protected: - - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } - + + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + EigenvectorType m_eivec; EigenvalueType m_eivalues; ComplexSchur m_schur; @@ -260,8 +259,6 @@ template ComplexEigenSolver& ComplexEigenSolver::compute(const EigenBase& matrix, bool computeEigenvectors) { - check_template_parameters(); - // this code is inspired from Jampack eigen_assert(matrix.cols() == matrix.rows()); diff --git a/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur.h b/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur.h index fc71468..80a28fb 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur.h @@ -14,6 +14,8 @@ #include "./HessenbergDecomposition.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -27,7 +29,7 @@ template struct complex_schur_reduce_to_hes * * \brief Performs a complex Schur decomposition of a real or complex square matrix * - * \tparam _MatrixType the type of the matrix of which we are + * \tparam MatrixType_ the type of the matrix of which we are * computing the Schur decomposition; this is expected to be an * instantiation of the Matrix class template. * @@ -48,10 +50,10 @@ template struct complex_schur_reduce_to_hes * * \sa class RealSchur, class EigenSolver, class ComplexEigenSolver */ -template class ComplexSchur +template class ComplexSchur { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -60,12 +62,12 @@ template class ComplexSchur MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - /** \brief Scalar type for matrices of type \p _MatrixType. */ + /** \brief Scalar type for matrices of type \p MatrixType_. */ typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - /** \brief Complex scalar type for \p _MatrixType. + /** \brief Complex scalar type for \p MatrixType_. * * This is \c std::complex if #Scalar is real (e.g., * \c float or \c double) and just \c Scalar if #Scalar is @@ -76,7 +78,7 @@ template class ComplexSchur /** \brief Type for the matrices in the Schur decomposition. * * This is a square matrix with entries of type #ComplexScalar. - * The size is the same as the size of \p _MatrixType. 
+ * The size is the same as the size of \p MatrixType_. */ typedef Matrix ComplexMatrixType; @@ -259,7 +261,7 @@ template class ComplexSchur friend struct internal::complex_schur_reduce_to_hessenberg::IsComplex>; }; -/** If m_matT(i+1,i) is neglegible in floating point arithmetic +/** If m_matT(i+1,i) is negligible in floating point arithmetic * compared to m_matT(i,i) and m_matT(j,j), then set it to zero and * return true, else return false. */ template @@ -306,7 +308,7 @@ typename ComplexSchur::ComplexScalar ComplexSchur::compu // In this case, det==0, and all we have to do is checking that eival2_norm!=0 if(eival1_norm > eival2_norm) eival2 = det / eival1; - else if(eival2_norm!=RealScalar(0)) + else if(!numext::is_exactly_zero(eival2_norm)) eival1 = det / eival2; // choose the eigenvalue closest to the bottom entry of the diagonal diff --git a/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h b/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h index 4980a3e..144eb2a 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +++ b/libs/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h @@ -33,6 +33,8 @@ #ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H #define EIGEN_COMPLEX_SCHUR_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ diff --git a/libs/eigen/Eigen/src/Eigenvalues/EigenSolver.h b/libs/eigen/Eigen/src/Eigenvalues/EigenSolver.h index 572b29e..f6ff140 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/EigenSolver.h +++ b/libs/eigen/Eigen/src/Eigenvalues/EigenSolver.h @@ -13,6 +13,8 @@ #include "./RealSchur.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -22,7 +24,7 @@ namespace Eigen { * * \brief Computes eigenvalues and eigenvectors of general matrices * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * eigendecomposition; this is expected to be an instantiation of the Matrix * class template. Currently, only real matrices are supported. * @@ -61,12 +63,12 @@ namespace Eigen { * * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver */ -template class EigenSolver +template class EigenSolver { public: - /** \brief Synonym for the template parameter \p _MatrixType. */ - typedef _MatrixType MatrixType; + /** \brief Synonym for the template parameter \p MatrixType_. */ + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, diff --git a/libs/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/libs/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 87d789b..d62c411 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/libs/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -14,6 +14,8 @@ #include "./RealQZ.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -23,7 +25,7 @@ namespace Eigen { * * \brief Computes the generalized eigenvalues and eigenvectors of a pair of general matrices * - * \tparam _MatrixType the type of the matrices of which we are computing the + * \tparam MatrixType_ the type of the matrices of which we are computing the * eigen-decomposition; this is expected to be an instantiation of the Matrix * class template. Currently, only real matrices are supported. 
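A pattern from the ComplexEigenSolver hunk above recurs throughout this diff: the runtime-invoked check_template_parameters() shim is deleted and EIGEN_STATIC_ASSERT_NON_INTEGER moves to class scope, so a bad instantiation is rejected as soon as the class template is instantiated instead of when compute() first runs. A reduced sketch of the before/after, with a plain static_assert standing in for Eigen's macro and an illustrative class name:

#include <type_traits>

template <typename Scalar>
class solver_sketch {
  // Before: compute() called a static check_template_parameters() containing
  // EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar). After: the check sits at class
  // scope and fires on instantiation of solver_sketch<int> itself.
  static_assert(!std::is_integral<Scalar>::value,
                "scalar type must be a non-integer type");
};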
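Likewise recurring: exact scalar comparisons such as eival2_norm != RealScalar(0) become calls to numext::is_exactly_zero. That helper is internal, undocumented API, so the sketch below only assumes it behaves as a named exact-equality test (x == T(0)); it is deliberately not an epsilon comparison, and the ComplexSchur shift code above relies on that to guard a literal division:

#include <Eigen/Core>

double safe_quotient(double det, double denom) {
  // Same meaning as denom == 0.0, but the intent (an exact test, on purpose)
  // is explicit and greppable.
  if (Eigen::numext::is_exactly_zero(denom))
    return 0.0;  // caller falls back to the other eigenvalue, as in computeShift()
  return det / denom;
}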
* @@ -55,12 +57,12 @@ namespace Eigen { * * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver */ -template class GeneralizedEigenSolver +template class GeneralizedEigenSolver { public: - /** \brief Synonym for the template parameter \p _MatrixType. */ - typedef _MatrixType MatrixType; + /** \brief Synonym for the template parameter \p MatrixType_. */ + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -119,8 +121,8 @@ template class GeneralizedEigenSolver : m_eivec(), m_alphas(), m_betas(), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ() {} @@ -134,8 +136,8 @@ template class GeneralizedEigenSolver : m_eivec(size, size), m_alphas(size), m_betas(size), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(size), m_tmp(size) {} @@ -156,8 +158,8 @@ template class GeneralizedEigenSolver : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(A.cols()), m_tmp(A.cols()) { @@ -177,7 +179,8 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors"); + eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } @@ -201,7 +204,7 @@ template class GeneralizedEigenSolver */ EigenvalueType eigenvalues() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues."); return EigenvalueType(m_alphas,m_betas); } @@ -210,9 +213,9 @@ template class GeneralizedEigenSolver * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j). * * \sa betas(), eigenvalues() */ - ComplexVectorType alphas() const + const ComplexVectorType& alphas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas."); return m_alphas; } @@ -221,9 +224,9 @@ template class GeneralizedEigenSolver * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j). 
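With m_valuesOkay/m_vectorsOkay replaced by m_isInitialized plus m_computeEigenvectors, the accessors now assert on info() == Success, so the intended calling sequence is: compute (or construct with matrices), check info(), then read results. A minimal usage sketch against the public API:

#include <Eigen/Eigenvalues>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 4);

  // The constructor runs the real QZ decomposition immediately.
  Eigen::GeneralizedEigenSolver<Eigen::MatrixXd> ges(A, B, /*computeEigenvectors=*/true);
  if (ges.info() != Eigen::Success) return 1;  // QZ iteration did not converge

  // eigenvalues()(i) is alphas()(i) / betas()(i); betas can contain zeros.
  std::cout << ges.eigenvalues().transpose() << "\n";
  std::cout << ges.eigenvectors() << "\n";
}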
* * \sa alphas(), eigenvalues() */ - VectorType betas() const + const VectorType& betas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas."); return m_betas; } @@ -254,7 +257,7 @@ template class GeneralizedEigenSolver ComputationInfo info() const { - eigen_assert(m_valuesOkay && "EigenSolver is not initialized."); + eigen_assert(m_isInitialized && "EigenSolver is not initialized."); return m_realQZ.info(); } @@ -267,17 +270,15 @@ template class GeneralizedEigenSolver } protected: - - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL); - } - + + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL) + EigenvectorsType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; - bool m_valuesOkay, m_vectorsOkay; + bool m_computeEigenvectors; + bool m_isInitialized; RealQZ m_realQZ; ComplexVectorType m_tmp; }; @@ -286,14 +287,10 @@ template GeneralizedEigenSolver& GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors) { - check_template_parameters(); - using std::sqrt; using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); Index size = A.cols(); - m_valuesOkay = false; - m_vectorsOkay = false; // Reduce to generalized real Schur form: // A = Q S Z and B = Q T Z m_realQZ.compute(A, B, computeEigenvectors); @@ -406,10 +403,9 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp i += 2; } } - - m_valuesOkay = true; - m_vectorsOkay = computeEigenvectors; } + m_computeEigenvectors = computeEigenvectors; + m_isInitialized = true; return *this; } diff --git a/libs/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/libs/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index d0f9091..dab66ca 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/libs/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -13,6 +13,8 @@ #include "./Tridiagonalization.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -22,7 +24,7 @@ namespace Eigen { * * \brief Computes eigenvalues and eigenvectors of the generalized selfadjoint eigen problem * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * eigendecomposition; this is expected to be an instantiation of the Matrix * class template. * @@ -44,19 +46,19 @@ namespace Eigen { * * \sa class SelfAdjointEigenSolver, class EigenSolver, class ComplexEigenSolver */ -template -class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixType> +template +class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver { - typedef SelfAdjointEigenSolver<_MatrixType> Base; + typedef SelfAdjointEigenSolver Base; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; /** \brief Default constructor for fixed-size matrices. * * The default constructor is useful in cases in which the user intends to * perform decompositions via compute(). 
This constructor - * can only be used if \p _MatrixType is a fixed-size matrix; use + * can only be used if \p MatrixType_ is a fixed-size matrix; use * GeneralizedSelfAdjointEigenSolver(Index) for dynamic-size matrices. */ GeneralizedSelfAdjointEigenSolver() : Base() {} diff --git a/libs/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/libs/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h index 1f21139..fafab99 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/libs/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -11,6 +11,8 @@ #ifndef EIGEN_HESSENBERGDECOMPOSITION_H #define EIGEN_HESSENBERGDECOMPOSITION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -31,7 +33,7 @@ struct traits > * * \brief Reduces a square matrix to Hessenberg form by an orthogonal similarity transformation * - * \tparam _MatrixType the type of the matrix of which we are computing the Hessenberg decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the Hessenberg decomposition * * This class performs an Hessenberg decomposition of a matrix \f$ A \f$. In * the real case, the Hessenberg decomposition consists of an orthogonal @@ -54,12 +56,12 @@ struct traits > * * \sa class ComplexSchur, class Tridiagonalization, \ref QR_Module "QR Module" */ -template class HessenbergDecomposition +template class HessenbergDecomposition { public: - /** \brief Synonym for the template parameter \p _MatrixType. */ - typedef _MatrixType MatrixType; + /** \brief Synonym for the template parameter \p MatrixType_. */ + typedef MatrixType_ MatrixType; enum { Size = MatrixType::RowsAtCompileTime, @@ -82,7 +84,7 @@ template class HessenbergDecomposition typedef Matrix CoeffVectorType; /** \brief Return type of matrixQ() */ - typedef HouseholderSequence::type> HouseholderSequenceType; + typedef HouseholderSequence> HouseholderSequenceType; typedef internal::HessenbergDecompositionMatrixHReturnType MatrixHReturnType; diff --git a/libs/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h new file mode 100644 index 0000000..374cbd4 --- /dev/null +++ b/libs/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_EIGENVALUES_MODULE_H +#error "Please include Eigen/Eigenvalues instead of including headers inside the src directory directly." 
+#endif diff --git a/libs/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/libs/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 66e5a3d..c8df260 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/libs/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -11,6 +11,8 @@ #ifndef EIGEN_MATRIXBASEEIGENVALUES_H #define EIGEN_MATRIXBASEEIGENVALUES_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Eigenvalues/RealQZ.h b/libs/eigen/Eigen/src/Eigenvalues/RealQZ.h index 5091301..545918f 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/RealQZ.h +++ b/libs/eigen/Eigen/src/Eigenvalues/RealQZ.h @@ -10,6 +10,8 @@ #ifndef EIGEN_REAL_QZ_H #define EIGEN_REAL_QZ_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -19,7 +21,7 @@ namespace Eigen { * * \brief Performs a real QZ decomposition of a pair of square matrices * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * real QZ decomposition; this is expected to be an instantiation of the * Matrix class template. * @@ -54,10 +56,10 @@ namespace Eigen { * \sa class RealSchur, class ComplexSchur, class EigenSolver, class ComplexEigenSolver */ - template class RealQZ + template class RealQZ { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -237,7 +239,7 @@ namespace Eigen { for (Index i=dim-1; i>=j+2; i--) { JRs G; // kill S(i,j) - if(m_S.coeff(i,j) != 0) + if(!numext::is_exactly_zero(m_S.coeff(i, j))) { G.makeGivens(m_S.coeff(i-1,j), m_S.coeff(i,j), &m_S.coeffRef(i-1, j)); m_S.coeffRef(i,j) = Scalar(0.0); @@ -248,7 +250,7 @@ namespace Eigen { m_Q.applyOnTheRight(i-1,i,G); } // kill T(i,i-1) - if(m_T.coeff(i,i-1)!=Scalar(0)) + if(!numext::is_exactly_zero(m_T.coeff(i, i - 1))) { G.makeGivens(m_T.coeff(i,i), m_T.coeff(i,i-1), &m_T.coeffRef(i,i)); m_T.coeffRef(i,i-1) = Scalar(0.0); @@ -286,7 +288,7 @@ namespace Eigen { while (res > 0) { Scalar s = abs(m_S.coeff(res-1,res-1)) + abs(m_S.coeff(res,res)); - if (s == Scalar(0.0)) + if (numext::is_exactly_zero(s)) s = m_normOfS; if (abs(m_S.coeff(res,res-1)) < NumTraits::epsilon() * s) break; @@ -316,7 +318,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i))==Scalar(0)) + if (numext::is_exactly_zero(abs(m_S.coeff(i + 1, i)))) return; Index j = findSmallDiagEntry(i,i+1); if (j==i-1) @@ -627,7 +629,7 @@ namespace Eigen { { for(Index i=0; i j_left, j_right; internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right); diff --git a/libs/eigen/Eigen/src/Eigenvalues/RealSchur.h b/libs/eigen/Eigen/src/Eigenvalues/RealSchur.h index 7304ef3..9817666 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/RealSchur.h +++ b/libs/eigen/Eigen/src/Eigenvalues/RealSchur.h @@ -13,6 +13,8 @@ #include "./HessenbergDecomposition.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \eigenvalues_module \ingroup Eigenvalues_Module @@ -22,7 +24,7 @@ namespace Eigen { * * \brief Performs a real Schur decomposition of a square matrix * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * real Schur decomposition; this is expected to be an instantiation of the * Matrix class template. 
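The new InternalHeaderCheck.h shown above is included at the top of every header in the module and turns a direct src/ include into an immediate compile error, since EIGEN_EIGENVALUES_MODULE_H is only defined by the module header itself. The effect, sketched:

#include <Eigen/Eigenvalues>  // defines EIGEN_EIGENVALUES_MODULE_H, then pulls in src/

// Wrong, and now a hard error instead of a subtle breakage:
//   #include <Eigen/src/Eigenvalues/RealSchur.h>
//   -> "Please include Eigen/Eigenvalues instead of including headers inside
//      the src directory directly."

Eigen::RealSchur<Eigen::MatrixXd> schur(4);  // fine: RealSchur.h arrived via the module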
* @@ -51,10 +53,10 @@ namespace Eigen { * * \sa class ComplexSchur, class EigenSolver, class ComplexEigenSolver */ -template class RealSchur +template class RealSchur { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -312,7 +314,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), (std::numeric_limits::min)() ); - if(norm!=Scalar(0)) + if(!numext::is_exactly_zero(norm)) { while (iu >= 0) { @@ -515,7 +517,7 @@ inline void RealSchur::performFrancisQRStep(Index il, Index im, Inde Matrix ess; v.makeHouseholder(ess, tau, beta); - if (beta != Scalar(0)) // if v is not zero + if (!numext::is_exactly_zero(beta)) // if v is not zero { if (firstIteration && k > il) m_matT.coeffRef(k,k-1) = -m_matT.coeff(k,k-1); @@ -535,7 +537,7 @@ inline void RealSchur::performFrancisQRStep(Index il, Index im, Inde Matrix ess; v.makeHouseholder(ess, tau, beta); - if (beta != Scalar(0)) // if v is not zero + if (!numext::is_exactly_zero(beta)) // if v is not zero { m_matT.coeffRef(iu-1, iu-2) = beta; m_matT.block(iu-1, iu-1, 2, size-iu+1).applyHouseholderOnTheLeft(ess, tau, workspace); diff --git a/libs/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h b/libs/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h index 2c22517..0a6ed21 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +++ b/libs/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h @@ -33,6 +33,8 @@ #ifndef EIGEN_REAL_SCHUR_LAPACKE_H #define EIGEN_REAL_SCHUR_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ diff --git a/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 1469236..d196ec0 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -13,9 +13,11 @@ #include "./Tridiagonalization.h" +#include "./InternalHeaderCheck.h" + namespace Eigen { -template +template class GeneralizedSelfAdjointEigenSolver; namespace internal { @@ -33,7 +35,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag * * \brief Computes eigenvalues and eigenvectors of selfadjoint matrices * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * eigendecomposition; this is expected to be an instantiation of the Matrix * class template. * @@ -73,11 +75,11 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag * * \sa MatrixBase::eigenvalues(), class EigenSolver, class ComplexEigenSolver */ -template class SelfAdjointEigenSolver +template class SelfAdjointEigenSolver { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; enum { Size = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -85,13 +87,13 @@ template class SelfAdjointEigenSolver MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - /** \brief Scalar type for matrices of type \p _MatrixType. */ + /** \brief Scalar type for matrices of type \p MatrixType_. 
*/ typedef typename MatrixType::Scalar Scalar; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 typedef Matrix EigenvectorsType; - /** \brief Real scalar type for \p _MatrixType. + /** \brief Real scalar type for \p MatrixType_. * * This is just \c Scalar if #Scalar is real (e.g., \c float or * \c double), and the type of the real part of \c Scalar if #Scalar is @@ -104,7 +106,7 @@ template class SelfAdjointEigenSolver /** \brief Type for vector of eigenvalues as returned by eigenvalues(). * * This is a column vector with entries of type #RealScalar. - * The length of the vector is the size of \p _MatrixType. + * The length of the vector is the size of \p MatrixType_. */ typedef typename internal::plain_col_type::type RealVectorType; typedef Tridiagonalization TridiagonalizationType; @@ -114,7 +116,7 @@ template class SelfAdjointEigenSolver * * The default constructor is useful in cases in which the user intends to * perform decompositions via compute(). This constructor - * can only be used if \p _MatrixType is a fixed-size matrix; use + * can only be used if \p MatrixType_ is a fixed-size matrix; use * SelfAdjointEigenSolver(Index) for dynamic-size matrices. * * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp @@ -372,12 +374,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static EIGEN_DEVICE_FUNC - void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } - + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; @@ -419,10 +417,8 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver& SelfAdjointEigenSolver ::compute(const EigenBase& a_matrix, int options) { - check_template_parameters(); - const InputType &matrix(a_matrix.derived()); - + EIGEN_USING_STD(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 @@ -451,7 +447,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver // map the matrix coefficients to [-1:1] to avoid over- and underflow. mat = matrix.template triangularView(); RealScalar scale = mat.cwiseAbs().maxCoeff(); - if(scale==RealScalar(0)) scale = RealScalar(1); + if(numext::is_exactly_zero(scale)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); m_hcoeffs.resize(n-1); @@ -530,7 +526,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag } // find the largest unreduced block at the end of the matrix. - while (end>0 && subdiag[end-1]==RealScalar(0)) + while (end>0 && numext::is_exactly_zero(subdiag[end - 1])) { end--; } @@ -542,7 +538,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag if(iter > maxIterations * n) break; start = end - 1; - while (start>0 && subdiag[start-1]!=0) + while (start>0 && !numext::is_exactly_zero(subdiag[start - 1])) start--; internal::tridiagonal_qr_step(diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n); @@ -847,12 +843,12 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta // RealScalar mu = diag[end] - e2 / (td + (td>0 ? 
1 : -1) * sqrt(td*td + e2)); // This explain the following, somewhat more complicated, version: RealScalar mu = diag[end]; - if(td==RealScalar(0)) { + if(numext::is_exactly_zero(td)) { mu -= numext::abs(e); - } else if (e != RealScalar(0)) { + } else if (!numext::is_exactly_zero(e)) { const RealScalar e2 = numext::abs2(e); const RealScalar h = numext::hypot(td,e); - if(e2 == RealScalar(0)) { + if(numext::is_exactly_zero(e2)) { mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e); } else { mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); @@ -863,7 +859,7 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta RealScalar z = subdiag[start]; // If z ever becomes zero, the Givens rotation will be the identity and // z will stay zero for all future iterations. - for (Index k = start; k < end && z != RealScalar(0); ++k) + for (Index k = start; k < end && !numext::is_exactly_zero(z); ++k) { JacobiRotation rot; rot.makeGivens(x, z); diff --git a/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index b0c947d..b24de67 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/libs/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -33,6 +33,8 @@ #ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H #define EIGEN_SAEIGENSOLVER_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ diff --git a/libs/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h b/libs/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h index 674c92a..9b002fe 100644 --- a/libs/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/libs/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -11,6 +11,8 @@ #ifndef EIGEN_TRIDIAGONALIZATION_H #define EIGEN_TRIDIAGONALIZATION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -36,7 +38,7 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); * * \brief Tridiagonal decomposition of a selfadjoint matrix * - * \tparam _MatrixType the type of the matrix of which we are computing the + * \tparam MatrixType_ the type of the matrix of which we are computing the * tridiagonal decomposition; this is expected to be an instantiation of the * Matrix class template. * @@ -61,12 +63,12 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); * * \sa class HessenbergDecomposition, class SelfAdjointEigenSolver */ -template class Tridiagonalization +template class Tridiagonalization { public: - /** \brief Synonym for the template parameter \p _MatrixType. */ - typedef _MatrixType MatrixType; + /** \brief Synonym for the template parameter \p MatrixType_. 
*/ + typedef MatrixType_ MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -83,21 +85,21 @@ template class Tridiagonalization typedef Matrix CoeffVectorType; typedef typename internal::plain_col_type::type DiagonalType; typedef Matrix SubDiagonalType; - typedef typename internal::remove_all::type MatrixTypeRealView; + typedef internal::remove_all_t MatrixTypeRealView; typedef internal::TridiagonalizationMatrixTReturnType MatrixTReturnType; - typedef typename internal::conditional::IsComplex, - typename internal::add_const_on_value_type::RealReturnType>::type, + typedef std::conditional_t::IsComplex, + internal::add_const_on_value_type_t::RealReturnType>, const Diagonal - >::type DiagonalReturnType; + > DiagonalReturnType; - typedef typename internal::conditional::IsComplex, - typename internal::add_const_on_value_type::RealReturnType>::type, + typedef std::conditional_t::IsComplex, + internal::add_const_on_value_type_t::RealReturnType>, const Diagonal - >::type SubDiagonalReturnType; + > SubDiagonalReturnType; /** \brief Return type of matrixQ() */ - typedef HouseholderSequence::type> HouseholderSequenceType; + typedef HouseholderSequence> HouseholderSequenceType; /** \brief Default constructor. * @@ -440,9 +442,8 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal template struct tridiagonalization_inplace_selector { - typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; - template + template static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { diff --git a/libs/eigen/Eigen/src/Geometry/AlignedBox.h b/libs/eigen/Eigen/src/Geometry/AlignedBox.h index 55a9d0a..a824817 100644 --- a/libs/eigen/Eigen/src/Geometry/AlignedBox.h +++ b/libs/eigen/Eigen/src/Geometry/AlignedBox.h @@ -46,6 +46,8 @@ #ifndef EIGEN_ALIGNEDBOX_H #define EIGEN_ALIGNEDBOX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -55,20 +57,20 @@ namespace Eigen { * * \brief An axis aligned box * - * \tparam _Scalar the type of the scalar coefficients - * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam Scalar_ the type of the scalar coefficients + * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic. * * This class represents an axis aligned box as a pair of the minimal and maximal corners. * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using isEmpty(). 
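The typedef churn in the Tridiagonalization hunk above is one mechanical modernization applied across the diff: Eigen's pre-C++11 metafunctions (internal::conditional<...>::type, remove_all<...>::type, add_const_on_value_type<...>::type) give way to C++14 alias templates with identical meaning and far less typename noise. A reduced before/after sketch with illustrative names:

#include <type_traits>

template <bool IsComplex, typename RealView, typename PlainDiag>
struct diagonal_return_sketch {
  // Before: typedef typename internal::conditional<IsComplex,
  //             RealView, PlainDiag>::type type;
  using type = std::conditional_t<IsComplex, RealView, PlainDiag>;
};

static_assert(std::is_same<diagonal_return_sketch<true, const double&, int>::type,
                           const double&>::value,
              "complex scalars select the real-part view");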
* \sa alignedboxtypedefs */ -template +template class AlignedBox { public: -EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) - enum { AmbientDimAtCompileTime = _AmbientDim }; - typedef _Scalar Scalar; +EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,AmbientDim_) + enum { AmbientDimAtCompileTime = AmbientDim_ }; + typedef Scalar_ Scalar; typedef NumTraits ScalarTraits; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 typedef typename ScalarTraits::Real RealScalar; @@ -181,7 +183,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) */ EIGEN_DEVICE_FUNC inline VectorType corner(CornerType corner) const { - EIGEN_STATIC_ASSERT(_AmbientDim <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE); + EIGEN_STATIC_ASSERT(AmbientDim_ <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE); VectorType res; diff --git a/libs/eigen/Eigen/src/Geometry/AngleAxis.h b/libs/eigen/Eigen/src/Geometry/AngleAxis.h index 78328b6..c23a908 100644 --- a/libs/eigen/Eigen/src/Geometry/AngleAxis.h +++ b/libs/eigen/Eigen/src/Geometry/AngleAxis.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ANGLEAXIS_H #define EIGEN_ANGLEAXIS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -18,7 +20,7 @@ namespace Eigen { * * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis * - * \param _Scalar the scalar type, i.e., the type of the coefficients. + * \param Scalar_ the scalar type, i.e., the type of the coefficients. * * \warning When setting up an AngleAxis object, the axis vector \b must \b be \b normalized. * @@ -39,16 +41,16 @@ namespace Eigen { */ namespace internal { -template struct traits > +template struct traits > { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; }; } -template -class AngleAxis : public RotationBase,3> +template +class AngleAxis : public RotationBase,3> { - typedef RotationBase,3> Base; + typedef RotationBase,3> Base; public: @@ -56,7 +58,7 @@ public: enum { Dim = 3 }; /** the scalar type of the coefficients */ - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Matrix Matrix3; typedef Matrix Vector3; typedef Quaternion QuaternionType; diff --git a/libs/eigen/Eigen/src/Geometry/EulerAngles.h b/libs/eigen/Eigen/src/Geometry/EulerAngles.h index 19b734c..2b99960 100644 --- a/libs/eigen/Eigen/src/Geometry/EulerAngles.h +++ b/libs/eigen/Eigen/src/Geometry/EulerAngles.h @@ -10,6 +10,8 @@ #ifndef EIGEN_EULERANGLES_H #define EIGEN_EULERANGLES_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module diff --git a/libs/eigen/Eigen/src/Geometry/Homogeneous.h b/libs/eigen/Eigen/src/Geometry/Homogeneous.h index 94083ac..538cf83 100644 --- a/libs/eigen/Eigen/src/Geometry/Homogeneous.h +++ b/libs/eigen/Eigen/src/Geometry/Homogeneous.h @@ -10,6 +10,8 @@ #ifndef EIGEN_HOMOGENEOUS_H #define EIGEN_HOMOGENEOUS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -35,7 +37,7 @@ struct traits > { typedef typename traits::StorageKind StorageKind; typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type _MatrixTypeNested; + typedef std::remove_reference_t MatrixTypeNested_; enum { RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ? int(MatrixType::RowsAtCompileTime) + 1 : Dynamic, @@ -45,7 +47,7 @@ struct traits > ColsAtCompileTime = Direction==Horizontal ? 
ColsPlusOne : MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = RowsAtCompileTime, MaxColsAtCompileTime = ColsAtCompileTime, - TmpFlags = _MatrixTypeNested::Flags & HereditaryBits, + TmpFlags = MatrixTypeNested_::Flags & HereditaryBits, Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit) : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit) : TmpFlags @@ -57,13 +59,13 @@ template struct homogeneous_right_product_impl } // end namespace internal -template class Homogeneous - : public MatrixBase >, internal::no_assignment_operator +template class Homogeneous + : public MatrixBase >, internal::no_assignment_operator { public: typedef MatrixType NestedExpression; - enum { Direction = _Direction }; + enum { Direction = Direction_ }; typedef MatrixBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous) @@ -225,7 +227,7 @@ template struct take_matrix_for_product > { typedef Transform TransformType; - typedef typename internal::add_const::type type; + typedef std::add_const_t type; EIGEN_DEVICE_FUNC static type run (const TransformType& x) { return x.affine(); } }; @@ -241,8 +243,8 @@ template struct traits,Lhs> > { typedef typename take_matrix_for_product::type LhsMatrixType; - typedef typename remove_all::type MatrixTypeCleaned; - typedef typename remove_all::type LhsMatrixTypeCleaned; + typedef remove_all_t MatrixTypeCleaned; + typedef remove_all_t LhsMatrixTypeCleaned; typedef typename make_proper_matrix_type< typename traits::Scalar, LhsMatrixTypeCleaned::RowsAtCompileTime, @@ -257,8 +259,8 @@ struct homogeneous_left_product_impl,Lhs> : public ReturnByValue,Lhs> > { typedef typename traits::LhsMatrixType LhsMatrixType; - typedef typename remove_all::type LhsMatrixTypeCleaned; - typedef typename remove_all::type LhsMatrixTypeNested; + typedef remove_all_t LhsMatrixTypeCleaned; + typedef remove_all_t LhsMatrixTypeNested; EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs) : m_lhs(take_matrix_for_product::run(lhs)), m_rhs(rhs) @@ -299,7 +301,7 @@ template struct homogeneous_right_product_impl,Rhs> : public ReturnByValue,Rhs> > { - typedef typename remove_all::type RhsNested; + typedef remove_all_t RhsNested; EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {} @@ -343,7 +345,7 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : Base(), m_temp(op) { - ::new (static_cast(this)) Base(m_temp); + internal::construct_at(this, m_temp); } protected: @@ -402,7 +404,7 @@ struct homogeneous_right_product_refactoring_helper Rows = Lhs::RowsAtCompileTime }; typedef typename Rhs::template ConstNRowsBlockXpr::Type LinearBlockConst; - typedef typename remove_const::type LinearBlock; + typedef std::remove_const_t LinearBlock; typedef typename Rhs::ConstRowXpr ConstantColumn; typedef Replicate ConstantBlock; typedef Product LinearProduct; @@ -455,7 +457,7 @@ struct homogeneous_left_product_refactoring_helper Cols = Rhs::ColsAtCompileTime }; typedef typename Lhs::template ConstNColsBlockXpr::Type LinearBlockConst; - typedef typename remove_const::type LinearBlock; + typedef std::remove_const_t LinearBlock; typedef typename Lhs::ConstColXpr ConstantColumn; typedef Replicate ConstantBlock; typedef Product LinearProduct; diff --git a/libs/eigen/Eigen/src/Geometry/Hyperplane.h b/libs/eigen/Eigen/src/Geometry/Hyperplane.h index cebe035..ad6aae9 100644 --- a/libs/eigen/Eigen/src/Geometry/Hyperplane.h +++ b/libs/eigen/Eigen/src/Geometry/Hyperplane.h @@ -11,6 +11,8 @@ 
#ifndef EIGEN_HYPERPLANE_H #define EIGEN_HYPERPLANE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -22,24 +24,24 @@ namespace Eigen { * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n. * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane. * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients - * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. - * Notice that the dimension of the hyperplane is _AmbientDim-1. + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients + * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic. + * Notice that the dimension of the hyperplane is AmbientDim_-1. * * This class represents an hyperplane as the zero set of the implicit equation * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part) * and \f$ d \f$ is the distance (offset) to the origin. */ -template +template class Hyperplane { public: - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,AmbientDim_==Dynamic ? Dynamic : AmbientDim_+1) enum { - AmbientDimAtCompileTime = _AmbientDim, - Options = _Options + AmbientDimAtCompileTime = AmbientDim_, + Options = Options_ }; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename NumTraits::Real RealScalar; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 typedef Matrix VectorType; @@ -106,7 +108,7 @@ public: if(norm <= v0.norm() * v1.norm() * NumTraits::epsilon()) { Matrix m; m << v0.transpose(), v1.transpose(); - JacobiSVD > svd(m, ComputeFullV); + JacobiSVD, ComputeFullV> svd(m); result.normal() = svd.matrixV().col(2); } else diff --git a/libs/eigen/Eigen/src/Geometry/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Geometry/InternalHeaderCheck.h new file mode 100644 index 0000000..a1159a3 --- /dev/null +++ b/libs/eigen/Eigen/src/Geometry/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_GEOMETRY_MODULE_H +#error "Please include Eigen/Geometry instead of including headers inside the src directory directly." 
+#endif diff --git a/libs/eigen/Eigen/src/Geometry/OrthoMethods.h b/libs/eigen/Eigen/src/Geometry/OrthoMethods.h index 524aebe..fbf020d 100644 --- a/libs/eigen/Eigen/src/Geometry/OrthoMethods.h +++ b/libs/eigen/Eigen/src/Geometry/OrthoMethods.h @@ -11,41 +11,87 @@ #ifndef EIGEN_ORTHOMETHODS_H #define EIGEN_ORTHOMETHODS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { +namespace internal { + +// Vector3 version (default) +template +struct cross_impl +{ + typedef typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType Scalar; + typedef Matrix::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> return_type; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + return_type run(const MatrixBase& first, const MatrixBase& second) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3) + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3) + + // Note that there is no need for an expression here since the compiler + // optimize such a small temporary very well (even within a complex expression) + typename internal::nested_eval::type lhs(first.derived()); + typename internal::nested_eval::type rhs(second.derived()); + return return_type( + numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)), + numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)), + numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)) + ); + } +}; + +// Vector2 version +template +struct cross_impl +{ + typedef typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType Scalar; + typedef Scalar return_type; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + return_type run(const MatrixBase& first, const MatrixBase& second) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,2); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,2); + typename internal::nested_eval::type lhs(first.derived()); + typename internal::nested_eval::type rhs(second.derived()); + return numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)); + } +}; + +} // end namespace internal + /** \geometry_module \ingroup Geometry_Module * - * \returns the cross product of \c *this and \a other + * \returns the cross product of \c *this and \a other. This is either a scalar for size-2 vectors or a size-3 vector for size-3 vectors. * - * Here is a very good explanation of cross-product: http://xkcd.com/199/ + * This method is implemented for two different cases: between vectors of fixed size 2 and between vectors of fixed size 3. * - * With complex numbers, the cross product is implemented as - * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$ + * For vectors of size 3, the output is simply the traditional cross product. + * + * For vectors of size 2, the output is a scalar. + * Given vectors \f$ v = \begin{bmatrix} v_1 & v_2 \end{bmatrix} \f$ and \f$ w = \begin{bmatrix} w_1 & w_2 \end{bmatrix} \f$, + * the result is simply \f$ v\times w = \overline{v_1 w_2 - v_2 w_1} = \text{conj}\left|\begin{smallmatrix} v_1 & w_1 \\ v_2 & w_2 \end{smallmatrix}\right| \f$; + * or, to put it differently, it is the third coordinate of the cross product of \f$ \begin{bmatrix} v_1 & v_2 & v_3 \end{bmatrix} \f$ and \f$ \begin{bmatrix} w_1 & w_2 & w_3 \end{bmatrix} \f$. + * For real-valued inputs, the result can be interpreted as the signed area of a parallelogram spanned by the two vectors. 
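The new size-2 overload of cross() documented above returns a scalar rather than a vector; for real inputs it is the signed parallelogram area. A quick check against the specialization added in this hunk:

#include <Eigen/Geometry>
#include <iostream>

int main() {
  Eigen::Vector2d v(1.0, 0.0), w(0.0, 2.0);
  double area = v.cross(w);     // v1*w2 - v2*w1 = 2 (counter-clockwise pair)
  double swapped = w.cross(v);  // -2: swapping the operands flips the sign
  std::cout << area << " " << swapped << "\n";
}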
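One more migration worth flagging, visible in the Hyperplane::Through hunk above and again in Quaternion::setFromTwoVectors below: JacobiSVD's computation options move from a runtime constructor argument to a template parameter, so the requested factors are fixed in the type. Old versus new, assuming this tree's option-carrying JacobiSVD:

#include <Eigen/SVD>

Eigen::Vector3d null_direction(const Eigen::Matrix<double, 2, 3>& m) {
  // Old: Eigen::JacobiSVD<Eigen::Matrix<double, 2, 3>> svd(m, Eigen::ComputeFullV);
  Eigen::JacobiSVD<Eigen::Matrix<double, 2, 3>, Eigen::ComputeFullV> svd(m);
  return svd.matrixV().col(2);  // direction of the smallest singular value
}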
+ * + * \note With complex numbers, the cross product is implemented as + * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c})\f$ * * \sa MatrixBase::cross3() */ template template -#ifndef EIGEN_PARSED_BY_DOXYGEN EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename MatrixBase::template cross_product_return_type::type +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename internal::cross_impl::return_type #else -typename MatrixBase::PlainObject +inline std::conditional_t #endif MatrixBase::cross(const MatrixBase& other) const { - EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3) - EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3) - - // Note that there is no need for an expression here since the compiler - // optimize such a small temporary very well (even within a complex expression) - typename internal::nested_eval::type lhs(derived()); - typename internal::nested_eval::type rhs(other.derived()); - return typename cross_product_return_type::type( - numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)), - numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)), - numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)) - ); + return internal::cross_impl::run(*this, other); } namespace internal { @@ -91,8 +137,8 @@ MatrixBase::cross3(const MatrixBase& other) const OtherDerivedNested rhs(other.derived()); return internal::cross3_impl::type, - typename internal::remove_all::type>::run(lhs,rhs); + internal::remove_all_t, + internal::remove_all_t>::run(lhs,rhs); } /** \geometry_module \ingroup Geometry_Module diff --git a/libs/eigen/Eigen/src/Geometry/ParametrizedLine.h b/libs/eigen/Eigen/src/Geometry/ParametrizedLine.h index 584f500..7576922 100644 --- a/libs/eigen/Eigen/src/Geometry/ParametrizedLine.h +++ b/libs/eigen/Eigen/src/Geometry/ParametrizedLine.h @@ -11,6 +11,8 @@ #ifndef EIGEN_PARAMETRIZEDLINE_H #define EIGEN_PARAMETRIZEDLINE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -23,19 +25,19 @@ namespace Eigen { * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$. * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients - * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic. + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients + * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic. */ -template +template class ParametrizedLine { public: - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,AmbientDim_) enum { - AmbientDimAtCompileTime = _AmbientDim, - Options = _Options + AmbientDimAtCompileTime = AmbientDim_, + Options = Options_ }; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename NumTraits::Real RealScalar; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 typedef Matrix VectorType; @@ -59,7 +61,7 @@ public: : m_origin(origin), m_direction(direction) {} template - EIGEN_DEVICE_FUNC explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane); + EIGEN_DEVICE_FUNC explicit ParametrizedLine(const Hyperplane& hyperplane); /** Constructs a parametrized line going from \a p0 to \a p1. 
*/ EIGEN_DEVICE_FUNC static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1) @@ -96,13 +98,13 @@ public: EIGEN_DEVICE_FUNC VectorType pointAt(const Scalar& t) const; template - EIGEN_DEVICE_FUNC Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; - + EIGEN_DEVICE_FUNC Scalar intersectionParameter(const Hyperplane& hyperplane) const; + template - EIGEN_DEVICE_FUNC Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; - + EIGEN_DEVICE_FUNC Scalar intersection(const Hyperplane& hyperplane) const; + template - EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; + EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane& hyperplane) const; /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this. * @@ -178,9 +180,9 @@ protected: * * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line */ -template +template template -EIGEN_DEVICE_FUNC inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane) +EIGEN_DEVICE_FUNC inline ParametrizedLine::ParametrizedLine(const Hyperplane& hyperplane) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2) direction() = hyperplane.normal().unitOrthogonal(); @@ -189,18 +191,18 @@ EIGEN_DEVICE_FUNC inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::Parame /** \returns the point at \a t along this line */ -template -EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType -ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const +template +EIGEN_DEVICE_FUNC inline typename ParametrizedLine::VectorType +ParametrizedLine::pointAt(const Scalar_& t) const { return origin() + (direction()*t); } /** \returns the parameter value of the intersection between \c *this and the given \a hyperplane */ -template +template template -EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const +EIGEN_DEVICE_FUNC inline Scalar_ ParametrizedLine::intersectionParameter(const Hyperplane& hyperplane) const { return -(hyperplane.offset()+hyperplane.normal().dot(origin())) / hyperplane.normal().dot(direction()); @@ -210,19 +212,19 @@ EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options> /** \deprecated use intersectionParameter() * \returns the parameter value of the intersection between \c *this and the given \a hyperplane */ -template +template template -EIGEN_DEVICE_FUNC inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const +EIGEN_DEVICE_FUNC inline Scalar_ ParametrizedLine::intersection(const Hyperplane& hyperplane) const { return intersectionParameter(hyperplane); } /** \returns the point of the intersection between \c *this and the given hyperplane */ -template +template template -EIGEN_DEVICE_FUNC inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType -ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const +EIGEN_DEVICE_FUNC inline typename ParametrizedLine::VectorType +ParametrizedLine::intersectionPoint(const Hyperplane& hyperplane) 
const { return pointAt(intersectionParameter(hyperplane)); } diff --git a/libs/eigen/Eigen/src/Geometry/Quaternion.h b/libs/eigen/Eigen/src/Geometry/Quaternion.h index 3259e59..0aca4c4 100644 --- a/libs/eigen/Eigen/src/Geometry/Quaternion.h +++ b/libs/eigen/Eigen/src/Geometry/Quaternion.h @@ -10,6 +10,8 @@ #ifndef EIGEN_QUATERNION_H #define EIGEN_QUATERNION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { @@ -44,8 +46,8 @@ class QuaternionBase : public RotationBase typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; typedef typename Coefficients::CoeffReturnType CoeffReturnType; - typedef typename internal::conditional::Flags&LvalueBit), - Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + typedef std::conditional_t::Flags&LvalueBit), + Scalar&, CoeffReturnType> NonConstCoeffReturnType; enum { @@ -198,14 +200,14 @@ class QuaternionBase : public RotationBase template EIGEN_DEVICE_FUNC inline - typename internal::enable_if::value,const Derived&>::type cast() const + std::enable_if_t::value,const Derived&> cast() const { return derived(); } template EIGEN_DEVICE_FUNC inline - typename internal::enable_if::value,Quaternion >::type cast() const + std::enable_if_t::value,Quaternion > cast() const { return Quaternion(coeffs().template cast()); } @@ -236,8 +238,8 @@ protected: * * \brief The quaternion class used to represent 3D orientations and rotations * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients - * \tparam _Options controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign. + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients + * \tparam Options_ controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign. * * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of * orientations and rotations of objects in three dimensions. Compared to other representations @@ -256,12 +258,12 @@ protected: */ namespace internal { -template -struct traits > +template +struct traits > { - typedef Quaternion<_Scalar,_Options> PlainObject; - typedef _Scalar Scalar; - typedef Matrix<_Scalar,4,1,_Options> Coefficients; + typedef Quaternion PlainObject; + typedef Scalar_ Scalar; + typedef Matrix Coefficients; enum{ Alignment = internal::traits::Alignment, Flags = LvalueBit @@ -269,14 +271,14 @@ struct traits > }; } -template -class Quaternion : public QuaternionBase > +template +class Quaternion : public QuaternionBase > { public: - typedef QuaternionBase > Base; + typedef QuaternionBase > Base; enum { NeedsAlignment = internal::traits::Alignment>0 }; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion) using Base::operator*=; @@ -307,7 +309,7 @@ public: /** Constructs and initializes a quaternion from either: * - a rotation matrix expression, - * - a 4D vector expression representing quaternion coefficients. + * - a 4D vector expression representing quaternion coefficients in the order [\c x, \c y, \c z, \c w]. */ template EIGEN_DEVICE_FUNC explicit inline Quaternion(const MatrixBase& other) { *this = other; } @@ -317,7 +319,6 @@ public: EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } -#if EIGEN_HAS_RVALUE_REFERENCES // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. 
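The doc clarification in the Quaternion hunk above deserves an example, because the two ways of building a Quaternion disagree on ordering: the four-scalar constructor takes w first, while the 4D-vector constructor reads storage order [x, y, z, w]:

#include <Eigen/Geometry>

// Both quaternions below are the identity rotation.
Eigen::Quaterniond a(1.0, 0.0, 0.0, 0.0);           // scalar ctor: (w, x, y, z)
Eigen::Quaterniond b(Eigen::Vector4d(0, 0, 0, 1));  // vector ctor: [x, y, z, w]
// a.coeffs() and b.coeffs() both read 0 0 0 1 (x y z w storage order).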
/** Default move constructor */ EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) @@ -330,7 +331,6 @@ public: m_coeffs = std::move(other.coeffs()); return *this; } -#endif EIGEN_DEVICE_FUNC static Quaternion UnitRandom(); @@ -341,20 +341,17 @@ public: EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;} EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment)) - + #ifdef EIGEN_QUATERNION_PLUGIN # include EIGEN_QUATERNION_PLUGIN #endif protected: Coefficients m_coeffs; - + #ifndef EIGEN_PARSED_BY_DOXYGEN - static EIGEN_STRONG_INLINE void _check_template_params() - { - EIGEN_STATIC_ASSERT( (_Options & DontAlign) == _Options, - INVALID_MATRIX_TEMPLATE_PARAMETERS) - } + EIGEN_STATIC_ASSERT( (Options_ & DontAlign) == Options_, + INVALID_MATRIX_TEMPLATE_PARAMETERS) #endif }; @@ -370,19 +367,19 @@ typedef Quaternion Quaterniond; ***************************************************************************/ namespace internal { - template - struct traits, _Options> > : traits > + template + struct traits, Options_> > : traits > { - typedef Map, _Options> Coefficients; + typedef Map, Options_> Coefficients; }; } namespace internal { - template - struct traits, _Options> > : traits > + template + struct traits, Options_> > : traits > { - typedef Map, _Options> Coefficients; - typedef traits > TraitsBase; + typedef Map, Options_> Coefficients; + typedef traits > TraitsBase; enum { Flags = TraitsBase::Flags & ~LvalueBit }; @@ -392,22 +389,22 @@ namespace internal { /** \ingroup Geometry_Module * \brief Quaternion expression mapping a constant memory buffer * - * \tparam _Scalar the type of the Quaternion coefficients - * \tparam _Options see class Map + * \tparam Scalar_ the type of the Quaternion coefficients + * \tparam Options_ see class Map * * This is a specialization of class Map for Quaternion. This class allows to view * a 4 scalar memory buffer as an Eigen's Quaternion object. * * \sa class Map, class Quaternion, class QuaternionBase */ -template -class Map, _Options > - : public QuaternionBase, _Options> > +template +class Map, Options_ > + : public QuaternionBase, Options_> > { public: - typedef QuaternionBase, _Options> > Base; + typedef QuaternionBase, Options_> > Base; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename internal::traits::Coefficients Coefficients; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) using Base::operator*=; @@ -417,7 +414,7 @@ class Map, _Options > * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order: * \code *coeffs == {x, y, z, w} \endcode * - * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */ + * If the template parameter Options_ is set to #Aligned, then the pointer coeffs must be aligned. */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {} EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs;} @@ -429,22 +426,22 @@ class Map, _Options > /** \ingroup Geometry_Module * \brief Expression of a quaternion from a memory buffer * - * \tparam _Scalar the type of the Quaternion coefficients - * \tparam _Options see class Map + * \tparam Scalar_ the type of the Quaternion coefficients + * \tparam Options_ see class Map * * This is a specialization of class Map for Quaternion. This class allows to view * a 4 scalar memory buffer as an Eigen's Quaternion object. 
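As described above, the Map specializations view a raw 4-scalar buffer as a quaternion without copying; the buffer must hold x, y, z, w in that order, and must be suitably aligned if Options_ is Aligned. A minimal sketch:

#include <Eigen/Geometry>

double buf[4] = {0.0, 0.0, 0.0, 1.0};          // x, y, z, w -> identity
Eigen::Map<Eigen::Quaterniond> q(buf);         // mutable view: writes land in buf
Eigen::Map<const Eigen::Quaterniond> qc(buf);  // read-only view (LvalueBit cleared)
Eigen::Vector3d r = q * Eigen::Vector3d::UnitX();  // rotate through the view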
* * \sa class Map, class Quaternion, class QuaternionBase */ -template -class Map, _Options > - : public QuaternionBase, _Options> > +template +class Map, Options_ > + : public QuaternionBase, Options_> > { public: - typedef QuaternionBase, _Options> > Base; + typedef QuaternionBase, Options_> > Base; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename internal::traits::Coefficients Coefficients; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) using Base::operator*=; @@ -454,7 +451,7 @@ class Map, _Options > * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order: * \code *coeffs == {x, y, z, w} \endcode * - * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */ + * If the template parameter Options_ is set to #Aligned, then the pointer coeffs must be aligned. */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {} EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; } @@ -654,7 +651,7 @@ EIGEN_DEVICE_FUNC inline Derived& QuaternionBase::setFromTwoVectors(con { c = numext::maxi(c,Scalar(-1)); Matrix m; m << v0.transpose(), v1.transpose(); - JacobiSVD > svd(m, ComputeFullV); + JacobiSVD, ComputeFullV> svd(m); Vector3 axis = svd.matrixV().col(2); Scalar w2 = (Scalar(1)+c)*Scalar(0.5); diff --git a/libs/eigen/Eigen/src/Geometry/Rotation2D.h b/libs/eigen/Eigen/src/Geometry/Rotation2D.h index d0bd575..aa7f863 100644 --- a/libs/eigen/Eigen/src/Geometry/Rotation2D.h +++ b/libs/eigen/Eigen/src/Geometry/Rotation2D.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ROTATION2D_H #define EIGEN_ROTATION2D_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -18,7 +20,7 @@ namespace Eigen { * * \brief Represents a rotation/orientation in a 2 dimensional space. * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients * * This class is equivalent to a single scalar representing a counter clock wise rotation * as a single angle in radian. 
It provides some additional features such as the automatic @@ -31,16 +33,16 @@ namespace Eigen { namespace internal { -template struct traits > +template struct traits > { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; }; } // end namespace internal -template -class Rotation2D : public RotationBase,2> +template +class Rotation2D : public RotationBase,2> { - typedef RotationBase,2> Base; + typedef RotationBase,2> Base; public: @@ -48,7 +50,7 @@ public: enum { Dim = 2 }; /** the scalar type of the coefficients */ - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Matrix Vector2; typedef Matrix Matrix2; diff --git a/libs/eigen/Eigen/src/Geometry/RotationBase.h b/libs/eigen/Eigen/src/Geometry/RotationBase.h index f0ee0bd..f21277f 100644 --- a/libs/eigen/Eigen/src/Geometry/RotationBase.h +++ b/libs/eigen/Eigen/src/Geometry/RotationBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_ROTATIONBASE_H #define EIGEN_ROTATIONBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // forward declaration @@ -23,13 +25,13 @@ struct rotation_base_generic_product_selector; * \brief Common base class for compact rotation representations * * \tparam Derived is the derived type, i.e., a rotation type - * \tparam _Dim the dimension of the space + * \tparam Dim_ the dimension of the space */ -template +template class RotationBase { public: - enum { Dim = _Dim }; + enum { Dim = Dim_ }; /** the scalar type of the coefficients */ typedef typename internal::traits::Scalar Scalar; @@ -135,9 +137,9 @@ struct rotation_base_generic_product_selector +template template -EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols> +EIGEN_DEVICE_FUNC Matrix ::Matrix(const RotationBase& r) { EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim)) @@ -148,10 +150,10 @@ EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols> * * \brief Set a Dim x Dim rotation matrix from the rotation \a r */ -template +template template -EIGEN_DEVICE_FUNC Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>& -Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols> +EIGEN_DEVICE_FUNC Matrix& +Matrix ::operator=(const RotationBase& r) { EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim)) diff --git a/libs/eigen/Eigen/src/Geometry/Scaling.h b/libs/eigen/Eigen/src/Geometry/Scaling.h index d352f1f..8bcdce6 100644 --- a/libs/eigen/Eigen/src/Geometry/Scaling.h +++ b/libs/eigen/Eigen/src/Geometry/Scaling.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SCALING_H #define EIGEN_SCALING_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -18,7 +20,7 @@ namespace Eigen { * * \brief Represents a generic uniform scaling transformation * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients. + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients. * * This class represent a uniform scaling transformation. 
It is the return * type of Scaling(Scalar), and most of the time this is the only way it @@ -45,12 +47,12 @@ namespace internal }; } -template +template class UniformScaling { public: /** the scalar type of the coefficients */ - typedef _Scalar Scalar; + typedef Scalar_ Scalar; protected: @@ -160,6 +162,11 @@ template inline const DiagonalWrapper Scaling(const MatrixBase& coeffs) { return coeffs.asDiagonal(); } +/** Constructs an axis aligned scaling expression from vector \a coeffs when passed as an rvalue reference */ +template +inline typename DiagonalWrapper::PlainObject Scaling(MatrixBase&& coeffs) +{ return typename DiagonalWrapper::PlainObject(std::move(coeffs.derived())); } + /** \deprecated */ typedef DiagonalMatrix AlignedScaling2f; /** \deprecated */ diff --git a/libs/eigen/Eigen/src/Geometry/Transform.h b/libs/eigen/Eigen/src/Geometry/Transform.h index 52b8c2a..fd0ae7e 100644 --- a/libs/eigen/Eigen/src/Geometry/Transform.h +++ b/libs/eigen/Eigen/src/Geometry/Transform.h @@ -12,6 +12,8 @@ #ifndef EIGEN_TRANSFORM_H #define EIGEN_TRANSFORM_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -63,15 +65,15 @@ struct transform_construct_from_matrix; template struct transform_take_affine_part; -template -struct traits > +template +struct traits > { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Eigen::Index StorageIndex; typedef Dense StorageKind; enum { - Dim1 = _Dim==Dynamic ? _Dim : _Dim + 1, - RowsAtCompileTime = _Mode==Projective ? Dim1 : _Dim, + Dim1 = Dim_==Dynamic ? Dim_ : Dim_ + 1, + RowsAtCompileTime = Mode_==Projective ? Dim1 : Dim_, ColsAtCompileTime = Dim1, MaxRowsAtCompileTime = RowsAtCompileTime, MaxColsAtCompileTime = ColsAtCompileTime, @@ -89,9 +91,9 @@ template struct transform_make_affine; * * \brief Represents an homogeneous transformation in a N dimensional space * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients - * \tparam _Dim the dimension of the space - * \tparam _Mode the type of the transformation. Can be: + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients + * \tparam Dim_ the dimension of the space + * \tparam Mode_ the type of the transformation. Can be: * - #Affine: the transformation is stored as a (Dim+1)^2 matrix, * where the last row is assumed to be [0 ... 0 1]. * - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix. @@ -100,7 +102,7 @@ template struct transform_make_affine; * - #Isometry: same as #Affine with the additional assumption that * the linear part represents a rotation. This assumption is exploited * to speed up some functions such as inverse() and rotation(). - * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. + * \tparam Options_ has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * * The homography is internally represented and stored by a matrix which @@ -200,20 +202,20 @@ template struct transform_make_affine; * * \sa class Matrix, class Quaternion */ -template +template class Transform { public: - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1)) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,Dim_==Dynamic ? 
Dynamic : (Dim_+1)*(Dim_+1)) enum { - Mode = _Mode, - Options = _Options, - Dim = _Dim, ///< space dimension in which the transformation holds - HDim = _Dim+1, ///< size of a respective homogeneous vector + Mode = Mode_, + Options = Options_, + Dim = Dim_, ///< space dimension in which the transformation holds + HDim = Dim_+1, ///< size of a respective homogeneous vector Rows = int(Mode)==(AffineCompact) ? Dim : HDim }; /** the scalar type of the coefficients */ - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Eigen::Index StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 /** type of the matrix used to represent the transformation */ @@ -227,13 +229,13 @@ public: /** type of read reference to the linear part of the transformation */ typedef const Block ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ - typedef typename internal::conditional >::type AffinePart; + Block > AffinePart; /** type of read reference to the affine part of the transformation */ - typedef typename internal::conditional >::type ConstAffinePart; + const Block > ConstAffinePart; /** type of a vector */ typedef Matrix VectorType; /** type of a read/write reference to the translation part of the rotation */ @@ -317,12 +319,12 @@ public: check_template_params(); // prevent conversions as: // Affine | AffineCompact | Isometry = Projective - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Projective), Mode==int(Projective)), + EIGEN_STATIC_ASSERT(internal::check_implication(OtherMode==int(Projective), Mode==int(Projective)), YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION) // prevent conversions as: // Isometry = Affine | AffineCompact - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Affine)||OtherMode==int(AffineCompact), Mode!=int(Isometry)), + EIGEN_STATIC_ASSERT(internal::check_implication(OtherMode==int(Affine)||OtherMode==int(AffineCompact), Mode!=int(Isometry)), YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION) enum { ModeIsAffineCompact = Mode == int(AffineCompact), @@ -367,9 +369,11 @@ public: } #ifdef EIGEN_QT_SUPPORT + #if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0)) inline Transform(const QMatrix& other); inline Transform& operator=(const QMatrix& other); inline QMatrix toQMatrix(void) const; + #endif inline Transform(const QTransform& other); inline Transform& operator=(const QTransform& other); inline QTransform toQTransform(void) const; @@ -443,7 +447,7 @@ public: * \li a general transformation matrix of size Dim+1 x Dim+1. */ template friend - EIGEN_DEVICE_FUNC inline const typename internal::transform_left_product_impl::ResultType + EIGEN_DEVICE_FUNC inline const typename internal::transform_left_product_impl::ResultType operator * (const EigenBase &a, const Transform &b) { return internal::transform_left_product_impl::run(a.derived(),b); } @@ -596,7 +600,7 @@ public: template EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase& r) const; - typedef typename internal::conditional::type RotationReturnType; + typedef std::conditional_t RotationReturnType; EIGEN_DEVICE_FUNC RotationReturnType rotation() const; template @@ -732,6 +736,8 @@ typedef Transform Projective3d; **************************/ #ifdef EIGEN_QT_SUPPORT + +#if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0)) /** Initializes \c *this from a QMatrix assuming the dimension is 2. * * This function is available only if the token EIGEN_QT_SUPPORT is defined. 
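Note: Qt 6 removed the QMatrix class outright, which is why every QMatrix-based member of Transform is fenced behind QT_VERSION < QT_VERSION_CHECK(6, 0, 0) above, while the QTransform overloads stay unguarded. A sketch of how calling code can stay source-compatible across both major versions (hypothetical alias and helper, assuming Qt headers are available):

    #include <QtGlobal>
    #include <QTransform>
    #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
    #include <QMatrix>
    using Qt2DMatrix = QMatrix;     // Qt 5: QMatrix still exists
    #else
    using Qt2DMatrix = QTransform;  // Qt 6: QMatrix is gone, QTransform remains
    #endif

    // Builds a pure scaling in whichever 2D matrix type this Qt provides;
    // both classes default-construct to the identity and expose scale().
    Qt2DMatrix makeScale(double sx, double sy) {
      Qt2DMatrix m;
      m.scale(sx, sy);
      return m;
    }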
@@ -776,6 +782,7 @@ QMatrix Transform::toQMatrix(void) const m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(0,2), m_matrix.coeff(1,2)); } +#endif /** Initializes \c *this from a QTransform assuming the dimension is 2. * @@ -1098,7 +1105,7 @@ template EIGEN_DEVICE_FUNC void Transform::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const { // Note that JacobiSVD is faster than BDCSVD for small matrices. - JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); + JacobiSVD svd(linear()); Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); @@ -1128,7 +1135,7 @@ template EIGEN_DEVICE_FUNC void Transform::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const { // Note that JacobiSVD is faster than BDCSVD for small matrices. - JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); + JacobiSVD svd(linear()); Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); @@ -1259,17 +1266,17 @@ template struct transform_take_affine_part { typedef typename TransformType::MatrixType MatrixType; typedef typename TransformType::AffinePart AffinePart; typedef typename TransformType::ConstAffinePart ConstAffinePart; - static inline AffinePart run(MatrixType& m) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AffinePart run(MatrixType& m) { return m.template block(0,0); } - static inline ConstAffinePart run(const MatrixType& m) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstAffinePart run(const MatrixType& m) { return m.template block(0,0); } }; template struct transform_take_affine_part > { typedef typename Transform::MatrixType MatrixType; - static inline MatrixType& run(MatrixType& m) { return m; } - static inline const MatrixType& run(const MatrixType& m) { return m; } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MatrixType& run(MatrixType& m) { return m; } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const MatrixType& run(const MatrixType& m) { return m; } }; /***************************************************** @@ -1279,7 +1286,7 @@ struct transform_take_affine_part > template struct transform_construct_from_matrix { - static inline void run(Transform *transform, const Other& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform *transform, const Other& other) { transform->linear() = other; transform->translation().setZero(); @@ -1290,7 +1297,7 @@ struct transform_construct_from_matrix template struct transform_construct_from_matrix { - static inline void run(Transform *transform, const Other& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform *transform, const Other& other) { transform->affine() = other; transform->makeAffine(); @@ -1300,14 +1307,14 @@ struct transform_construct_from_matrix template struct transform_construct_from_matrix { - static inline void run(Transform *transform, const Other& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform *transform, const Other& other) { transform->matrix() = other; } }; template struct transform_construct_from_matrix { - static inline void run(Transform *transform, const Other& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Transform *transform, const Other& other) { transform->matrix() = other.template block(0,0); } }; @@ -1397,7 +1404,7 @@ struct 
transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is Dim = TransformType::Dim, HDim = TransformType::HDim, OtherRows = MatrixType::RowsAtCompileTime, - WorkingRows = EIGEN_PLAIN_ENUM_MIN(TransformMatrix::RowsAtCompileTime,HDim) + WorkingRows = plain_enum_min(TransformMatrix::RowsAtCompileTime, HDim) }; typedef typename MatrixType::PlainObject ResultType; diff --git a/libs/eigen/Eigen/src/Geometry/Translation.h b/libs/eigen/Eigen/src/Geometry/Translation.h index 8c22901..dd0adba 100644 --- a/libs/eigen/Eigen/src/Geometry/Translation.h +++ b/libs/eigen/Eigen/src/Geometry/Translation.h @@ -10,6 +10,8 @@ #ifndef EIGEN_TRANSLATION_H #define EIGEN_TRANSLATION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \geometry_module \ingroup Geometry_Module @@ -18,23 +20,23 @@ namespace Eigen { * * \brief Represents a translation transformation * - * \tparam _Scalar the scalar type, i.e., the type of the coefficients. - * \tparam _Dim the dimension of the space, can be a compile time value or Dynamic + * \tparam Scalar_ the scalar type, i.e., the type of the coefficients. + * \tparam Dim_ the dimension of the space, can be a compile time value or Dynamic * * \note This class is not aimed to be used to store a translation transformation, * but rather to make easier the constructions and updates of Transform objects. * * \sa class Scaling, class Transform */ -template +template class Translation { public: - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,Dim_) /** dimension of the space */ - enum { Dim = _Dim }; + enum { Dim = Dim_ }; /** the scalar type of the coefficients */ - typedef _Scalar Scalar; + typedef Scalar_ Scalar; /** corresponding vector type */ typedef Matrix VectorType; /** corresponding linear transformation matrix type */ @@ -131,7 +133,7 @@ public: /** Applies translation to vector */ template - inline typename internal::enable_if::type + inline std::enable_if_t operator* (const MatrixBase& vec) const { return m_coeffs + vec.derived(); } diff --git a/libs/eigen/Eigen/src/Geometry/Umeyama.h b/libs/eigen/Eigen/src/Geometry/Umeyama.h index 6b75500..8049787 100644 --- a/libs/eigen/Eigen/src/Geometry/Umeyama.h +++ b/libs/eigen/Eigen/src/Geometry/Umeyama.h @@ -16,6 +16,8 @@ // * Eigen/SVD // * Eigen/Array +#include "./InternalHeaderCheck.h" + namespace Eigen { #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -32,10 +34,10 @@ template struct umeyama_transform_matrix_type { enum { - MinRowsAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, OtherMatrixType::RowsAtCompileTime), + MinRowsAtCompileTime = internal::min_size_prefer_dynamic(MatrixType::RowsAtCompileTime, OtherMatrixType::RowsAtCompileTime), // When possible we want to choose some small fixed size value since the result - // is likely to fit on the stack. So here, EIGEN_SIZE_MIN_PREFER_DYNAMIC is not what we want. + // is likely to fit on the stack. So here, min_size_prefer_dynamic is not what we want. HomogeneousDimension = int(MinRowsAtCompileTime) == Dynamic ? 
Dynamic : int(MinRowsAtCompileTime)+1 }; @@ -102,7 +104,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo EIGEN_STATIC_ASSERT((internal::is_same::Scalar>::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - enum { Dimension = EIGEN_SIZE_MIN_PREFER_DYNAMIC(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) }; + enum { Dimension = internal::min_size_prefer_dynamic(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) }; typedef Matrix VectorType; typedef Matrix MatrixType; @@ -122,13 +124,10 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo const RowMajorMatrixType src_demean = src.colwise() - src_mean; const RowMajorMatrixType dst_demean = dst.colwise() - dst_mean; - // Eq. (36)-(37) - const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n; - // Eq. (38) const MatrixType sigma = one_over_n * dst_demean * src_demean.transpose(); - JacobiSVD svd(sigma, ComputeFullU | ComputeFullV); + JacobiSVD svd(sigma); // Initialize the resulting transformation with an identity matrix... TransformationMatrixType Rt = TransformationMatrixType::Identity(m+1,m+1); @@ -144,6 +143,9 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo if (with_scaling) { + // Eq. (36)-(37) + const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n; + // Eq. (42) const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S); diff --git a/libs/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h b/libs/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h index 9af6a9a..bd91949 100644 --- a/libs/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +++ b/libs/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -11,6 +11,8 @@ #ifndef EIGEN_GEOMETRY_SIMD_H #define EIGEN_GEOMETRY_SIMD_H +#include "../InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/Householder/BlockHouseholder.h b/libs/eigen/Eigen/src/Householder/BlockHouseholder.h index 39ce1c2..a5c8095 100644 --- a/libs/eigen/Eigen/src/Householder/BlockHouseholder.h +++ b/libs/eigen/Eigen/src/Householder/BlockHouseholder.h @@ -13,6 +13,8 @@ // This file contains some helper function to deal with block householder reflectors +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -85,7 +87,7 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c template void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs, bool forward) { - enum { TFactorSize = MatrixType::ColsAtCompileTime }; + enum { TFactorSize = VectorsType::ColsAtCompileTime }; Index nbVecs = vectors.cols(); Matrix T(nbVecs,nbVecs); diff --git a/libs/eigen/Eigen/src/Householder/Householder.h b/libs/eigen/Eigen/src/Householder/Householder.h index 5bc037f..855b752 100644 --- a/libs/eigen/Eigen/src/Householder/Householder.h +++ b/libs/eigen/Eigen/src/Householder/Householder.h @@ -11,6 +11,8 @@ #ifndef EIGEN_HOUSEHOLDER_H #define EIGEN_HOUSEHOLDER_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -69,7 +71,7 @@ void MatrixBase::makeHouseholder( Scalar& tau, RealScalar& beta) const { - using std::sqrt; + using numext::sqrt; using numext::conj; EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart) @@ -122,7 +124,7 @@ void MatrixBase::applyHouseholderOnTheLeft( { *this *= Scalar(1)-tau; } - else if(tau!=Scalar(0)) + else if(!numext::is_exactly_zero(tau)) { Map::type> tmp(workspace,cols()); Block 
bottom(derived(), 1, 0, rows()-1, cols()); @@ -160,7 +162,7 @@ void MatrixBase::applyHouseholderOnTheRight( { *this *= Scalar(1)-tau; } - else if(tau!=Scalar(0)) + else if(!numext::is_exactly_zero(tau)) { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); diff --git a/libs/eigen/Eigen/src/Householder/HouseholderSequence.h b/libs/eigen/Eigen/src/Householder/HouseholderSequence.h index 022f6c3..41fef64 100644 --- a/libs/eigen/Eigen/src/Householder/HouseholderSequence.h +++ b/libs/eigen/Eigen/src/Householder/HouseholderSequence.h @@ -11,6 +11,8 @@ #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H #define EIGEN_HOUSEHOLDER_SEQUENCE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup Householder_Module @@ -131,34 +133,34 @@ template class HouseholderS typedef typename internal::traits::Scalar Scalar; typedef HouseholderSequence< - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - VectorsType>::type, - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - CoeffsType>::type, + std::conditional_t::IsComplex, + internal::remove_all_t, + VectorsType>, + std::conditional_t::IsComplex, + internal::remove_all_t, + CoeffsType>, Side > ConjugateReturnType; typedef HouseholderSequence< VectorsType, - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - CoeffsType>::type, + std::conditional_t::IsComplex, + internal::remove_all_t, + CoeffsType>, Side > AdjointReturnType; typedef HouseholderSequence< - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - VectorsType>::type, + std::conditional_t::IsComplex, + internal::remove_all_t, + VectorsType>, CoeffsType, Side > TransposeReturnType; typedef HouseholderSequence< - typename internal::add_const::type, - typename internal::add_const::type, + std::add_const_t, + std::add_const_t, Side > ConstHouseholderSequence; @@ -255,10 +257,10 @@ template class HouseholderS */ template EIGEN_DEVICE_FUNC - inline typename internal::conditional::type + inline std::conditional_t conjugateIf() const { - typedef typename internal::conditional::type ReturnType; + typedef std::conditional_t ReturnType; return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); } @@ -382,21 +384,25 @@ template class HouseholderS Index bs = end-k; Index start = k + m_shift; - typedef Block::type,Dynamic,Dynamic> SubVectorsType; + typedef Block,Dynamic,Dynamic> SubVectorsType; SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start, Side==OnTheRight ? start : k, Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); - typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); + std::conditional_t, SubVectorsType&> sub_vecs(sub_vecs1); - Index dstStart = dst.rows()-rows()+m_shift+k; Index dstRows = rows()-m_shift-k; - Block sub_dst(dst, - dstStart, - inputIsIdentity ? dstStart : 0, - dstRows, - inputIsIdentity ? 
dstRows : dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); + + if (inputIsIdentity) + { + Block sub_dst = dst.bottomRightCorner(dstRows, dstRows); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); + } + else + { + auto sub_dst = dst.bottomRows(dstRows); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); + } } } else @@ -405,9 +411,18 @@ template class HouseholderS for(Index k = 0; k < m_length; ++k) { Index actual_k = m_reverse ? k : m_length-k-1; - Index dstStart = rows()-m_shift-actual_k; - dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) - .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); + Index dstRows = rows()-m_shift-actual_k; + + if (inputIsIdentity) + { + Block sub_dst = dst.bottomRightCorner(dstRows, dstRows); + sub_dst.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); + } + else + { + auto sub_dst = dst.bottomRows(dstRows); + sub_dst.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); + } } } } @@ -428,7 +443,7 @@ template class HouseholderS return res; } - template friend struct internal::hseq_side_dependent_impl; + template friend struct internal::hseq_side_dependent_impl; /** \brief Sets the length of the Householder sequence. * \param [in] length New value for the length. diff --git a/libs/eigen/Eigen/src/Householder/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Householder/InternalHeaderCheck.h new file mode 100644 index 0000000..70de89b --- /dev/null +++ b/libs/eigen/Eigen/src/Householder/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_HOUSEHOLDER_MODULE_H +#error "Please include Eigen/Householder instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index a117fc1..d2d55b7 100644 --- a/libs/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -10,6 +10,8 @@ #ifndef EIGEN_BASIC_PRECONDITIONERS_H #define EIGEN_BASIC_PRECONDITIONERS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup IterativeLinearSolvers_Module @@ -21,7 +23,7 @@ namespace Eigen { A.diagonal().asDiagonal() . x = b \endcode * - * \tparam _Scalar the type of the scalar. + * \tparam Scalar_ the type of the scalar. * * \implsparsesolverconcept * @@ -32,10 +34,10 @@ namespace Eigen { * * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient */ -template +template class DiagonalPreconditioner { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Matrix Vector; public: typedef typename Vector::StorageIndex StorageIndex; @@ -116,7 +118,7 @@ class DiagonalPreconditioner (A.adjoint() * A).diagonal().asDiagonal() * x = b \endcode * - * \tparam _Scalar the type of the scalar. + * \tparam Scalar_ the type of the scalar. 
* * \implsparsesolverconcept * @@ -124,12 +126,12 @@ class DiagonalPreconditioner * * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner */ -template -class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> +template +class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner { - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename NumTraits::Real RealScalar; - typedef DiagonalPreconditioner<_Scalar> Base; + typedef DiagonalPreconditioner Base; using Base::m_invdiag; public: diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef..76195c7 100644 --- a/libs/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -11,6 +11,8 @@ #ifndef EIGEN_BICGSTAB_H #define EIGEN_BICGSTAB_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -49,9 +51,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, x.setZero(); return true; } - Scalar rho = 1; - Scalar alpha = 1; - Scalar w = 1; + Scalar rho (1); + Scalar alpha (1); + Scalar w (1); VectorType v = VectorType::Zero(n), p = VectorType::Zero(n); VectorType y(n), z(n); @@ -108,17 +110,17 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, } -template< typename _MatrixType, - typename _Preconditioner = DiagonalPreconditioner > +template< typename MatrixType_, + typename Preconditioner_ = DiagonalPreconditioner > class BiCGSTAB; namespace internal { -template< typename _MatrixType, typename _Preconditioner> -struct traits > +template< typename MatrixType_, typename Preconditioner_> +struct traits > { - typedef _MatrixType MatrixType; - typedef _Preconditioner Preconditioner; + typedef MatrixType_ MatrixType; + typedef Preconditioner_ Preconditioner; }; } @@ -129,8 +131,8 @@ struct traits > * This class allows to solve for A.x = b sparse linear problems using a bi conjugate gradient * stabilized algorithm. The vectors x and b can be either dense or sparse. * - * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. - * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner + * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix. + * \tparam Preconditioner_ the type of the preconditioner. 
Default is DiagonalPreconditioner * * \implsparsesolverconcept * @@ -154,8 +156,8 @@ struct traits > * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ -template< typename _MatrixType, typename _Preconditioner> -class BiCGSTAB : public IterativeSolverBase > +template< typename MatrixType_, typename Preconditioner_> +class BiCGSTAB : public IterativeSolverBase > { typedef IterativeSolverBase Base; using Base::matrix; @@ -164,10 +166,10 @@ class BiCGSTAB : public IterativeSolverBase VectorType; @@ -56,7 +56,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, if (residualNorm2 < threshold) { iters = 0; - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); return; } @@ -86,23 +86,23 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, p = z + beta * p; // update search direction i++; } - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); iters = i; } } -template< typename _MatrixType, int _UpLo=Lower, - typename _Preconditioner = DiagonalPreconditioner > +template< typename MatrixType_, int UpLo_=Lower, + typename Preconditioner_ = DiagonalPreconditioner > class ConjugateGradient; namespace internal { -template< typename _MatrixType, int _UpLo, typename _Preconditioner> -struct traits > +template< typename MatrixType_, int UpLo_, typename Preconditioner_> +struct traits > { - typedef _MatrixType MatrixType; - typedef _Preconditioner Preconditioner; + typedef MatrixType_ MatrixType; + typedef Preconditioner_ Preconditioner; }; } @@ -113,11 +113,11 @@ struct traits > * This class allows to solve for A.x = b linear problems using an iterative conjugate gradient algorithm. * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * - * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * \tparam MatrixType_ the type of the matrix A, can be a dense or a sparse matrix. + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower, * \c Upper, or \c Lower|Upper in which the full matrix entries will be considered. * Default is \c Lower, best performance is \c Lower|Upper. - * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner + * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner * * \implsparsesolverconcept * @@ -127,8 +127,8 @@ struct traits > * * The tolerance corresponds to the relative residual error: |Ax-b|/|b| * - * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is - * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this + * \b Performance: Even though the default value of \c UpLo_ is \c Lower, significantly higher performance is + * achieved when using a complete matrix and \b Lower|Upper as the \a UpLo_ template parameter. Moreover, in this * case multi-threading can be exploited if the user code is compiled with OpenMP enabled. * See \ref TopicMultiThreading for details. 
* @@ -154,8 +154,8 @@ struct traits > * * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ -template< typename _MatrixType, int _UpLo, typename _Preconditioner> -class ConjugateGradient : public IterativeSolverBase > +template< typename MatrixType_, int UpLo_, typename Preconditioner_> +class ConjugateGradient : public IterativeSolverBase > { typedef IterativeSolverBase Base; using Base::matrix; @@ -164,13 +164,13 @@ class ConjugateGradient : public IterativeSolverBase::IsComplex) }; - typedef typename internal::conditional, ActualMatrixType const&>::type RowMajorWrapper; - EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY); - typedef typename internal::conditional::Type - >::type SelfAdjointWrapper; + typedef std::conditional_t, ActualMatrixType const&> RowMajorWrapper; + EIGEN_STATIC_ASSERT(internal::check_implication(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY); + typedef std::conditional_t::Type + > SelfAdjointWrapper; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 7803fd8..e697f32 100644 --- a/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -14,6 +14,8 @@ #include #include +#include "./InternalHeaderCheck.h" + namespace Eigen { /** * \brief Modified Incomplete Cholesky with dual threshold @@ -22,9 +24,9 @@ namespace Eigen { * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 * * \tparam Scalar the scalar type of the input matrices - * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower + * \tparam UpLo_ The triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. - * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering, + * \tparam OrderingType_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering, * unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering. * * \implsparsesolverconcept @@ -41,15 +43,15 @@ namespace Eigen { * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. 
* */ -template > -class IncompleteCholesky : public SparseSolverBase > +template > +class IncompleteCholesky : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase > Base; using Base::m_isInitialized; public: typedef typename NumTraits::Real RealScalar; - typedef _OrderingType OrderingType; + typedef OrderingType_ OrderingType; typedef typename OrderingType::PermutationType PermutationType; typedef typename PermutationType::StorageIndex StorageIndex; typedef SparseMatrix FactorType; @@ -57,7 +59,7 @@ class IncompleteCholesky : public SparseSolverBase VectorRx; typedef Matrix VectorIx; typedef std::vector > VectorList; - enum { UpLo = _UpLo }; + enum { UpLo = UpLo_ }; enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic @@ -160,13 +162,13 @@ class IncompleteCholesky : public SparseSolverBase -template -void IncompleteCholesky::factorize(const _MatrixType& mat) +template +template +void IncompleteCholesky::factorize(const MatrixType_& mat) { using std::sqrt; eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); @@ -199,12 +201,12 @@ void IncompleteCholesky::factorize(const _MatrixType { // The temporary is needed to make sure that the diagonal entry is properly sorted FactorType tmp(mat.rows(), mat.cols()); - tmp = mat.template selfadjointView<_UpLo>().twistedBy(m_perm); + tmp = mat.template selfadjointView().twistedBy(m_perm); m_L.template selfadjointView() = tmp.template selfadjointView(); } else { - m_L.template selfadjointView() = mat.template selfadjointView<_UpLo>(); + m_L.template selfadjointView() = mat.template selfadjointView(); } Index n = m_L.cols(); @@ -369,8 +371,8 @@ void IncompleteCholesky::factorize(const _MatrixType } while(m_info!=Success); } -template -inline void IncompleteCholesky::updateList(Ref colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol) +template +inline void IncompleteCholesky::updateList(Ref colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol) { if (jk < colPtr(col+1) ) { diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index cdcf709..44f25fc 100644 --- a/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -12,6 +12,8 @@ #define EIGEN_INCOMPLETE_LUT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -95,15 +97,15 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * alternatively, on GMANE: * http://comments.gmane.org/gmane.comp.lib.eigen/3302 */ -template -class IncompleteLUT : public SparseSolverBase > +template +class IncompleteLUT : public SparseSolverBase > { protected: typedef SparseSolverBase Base; using Base::m_isInitialized; public: - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; typedef typename NumTraits::Real RealScalar; typedef Matrix Vector; typedef Matrix VectorI; @@ -219,8 +221,8 @@ void IncompleteLUT::setFillfactor(int fillfactor) } template -template -void IncompleteLUT::analyzePattern(const _MatrixType& amat) +template +void IncompleteLUT::analyzePattern(const MatrixType_& amat) { // Compute the Fill-reducing permutation // Since ILUT does not perform any numerical pivoting, @@ -240,8 +242,8 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) } template -template 
-void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
+template<typename MatrixType_>
+void IncompleteLUT<Scalar,StorageIndex>::factorize(const MatrixType_& amat)
 {
   using std::sqrt;
   using std::swap;
diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h
new file mode 100644
index 0000000..b657e84
--- /dev/null
+++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#error "Please include Eigen/IterativeLinearSolvers instead of including headers inside the src directory directly."
+#endif
diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 28a0c51..49829d0 100644
--- a/libs/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -10,6 +10,8 @@
 #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
 #define EIGEN_ITERATIVE_SOLVER_BASE_H

+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {

 namespace internal {
@@ -40,7 +42,7 @@ public:
 template<typename MatrixType>
 struct is_ref_compatible
 {
-  enum { value = is_ref_compatible_impl<typename remove_all<MatrixType>::type>::value };
+  enum { value = is_ref_compatible_impl<remove_all_t<MatrixType>>::value };
 };

 template<typename MatrixType, bool MatrixFree = !internal::is_ref_compatible<MatrixType>::value>
@@ -77,16 +79,16 @@ public:
   template<typename InputType>
   void grab(const EigenBase<InputType> &mat)
   {
-    m_matrix.~Ref<const MatrixType>();
-    ::new (&m_matrix) Ref<const MatrixType>(mat.derived());
+    internal::destroy_at(&m_matrix);
+    internal::construct_at(&m_matrix, mat.derived());
   }

   void grab(const Ref<const MatrixType> &mat)
   {
     if(&(mat.derived()) != &m_matrix)
     {
-      m_matrix.~Ref<const MatrixType>();
-      ::new (&m_matrix) Ref<const MatrixType>(mat);
+      internal::destroy_at(&m_matrix);
+      internal::construct_at(&m_matrix, mat);
     }
   }
@@ -186,6 +188,9 @@ public:
     compute(matrix());
   }
+
+  IterativeSolverBase(IterativeSolverBase&&) = default;
+
   ~IterativeSolverBase() {}

   /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
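Note: the grab() hunks above swap a manual destructor call plus placement new for internal::destroy_at/internal::construct_at. The idiom being expressed: a Ref member has no assignment operator, so re-seating it onto another matrix requires ending its lifetime and constructing a fresh object in the same storage. A stand-alone sketch of that idiom using the std counterparts (hypothetical View/Holder types; std::construct_at needs C++20, whereas Eigen presumably keeps its own internal wrappers for its C++14 baseline):

    #include <memory>  // std::destroy_at (C++17), std::construct_at (C++20)

    struct View {                    // stand-in for Eigen::Ref: not assignable
      const double* data;
      explicit View(const double* d) : data(d) {}
      View& operator=(const View&) = delete;
    };

    struct Holder {
      View v;
      explicit Holder(const double* d) : v(d) {}
      void rebind(const double* d) {
        std::destroy_at(&v);       // end the old view's lifetime
        std::construct_at(&v, d);  // begin a new one in the same storage
      }
    };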
@@ -295,7 +300,7 @@ public: /** \returns the number of iterations performed during the last solve */ Index iterations() const { - eigen_assert(m_isInitialized && "ConjugateGradient is not initialized."); + eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized."); return m_iterations; } @@ -304,7 +309,7 @@ public: */ RealScalar error() const { - eigen_assert(m_isInitialized && "ConjugateGradient is not initialized."); + eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized."); return m_error; } @@ -364,7 +369,7 @@ public: } template - typename internal::enable_if::type + std::enable_if_t _solve_with_guess_impl(const Rhs& b, MatrixBase &aDest) const { eigen_assert(rows()==b.rows()); @@ -389,7 +394,7 @@ public: } template - typename internal::enable_if::type + std::enable_if_t _solve_with_guess_impl(const Rhs& b, MatrixBase &dest) const { derived()._solve_vector_with_guess_impl(b,dest.derived()); diff --git a/libs/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/libs/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index 203fd0e..a76f3f8 100644 --- a/libs/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/libs/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -10,6 +10,8 @@ #ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H #define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -73,7 +75,7 @@ void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest Scalar alpha = absNew / tmp.squaredNorm(); // the amount we travel on dir x += alpha * p; // update solution residual -= alpha * tmp; // update residual - normal_residual = mat.adjoint() * residual; // update residual of the normal equation + normal_residual.noalias() = mat.adjoint() * residual; // update residual of the normal equation residualNorm2 = normal_residual.squaredNorm(); if(residualNorm2 < threshold) @@ -93,17 +95,17 @@ void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest } -template< typename _MatrixType, - typename _Preconditioner = LeastSquareDiagonalPreconditioner > +template< typename MatrixType_, + typename Preconditioner_ = LeastSquareDiagonalPreconditioner > class LeastSquaresConjugateGradient; namespace internal { -template< typename _MatrixType, typename _Preconditioner> -struct traits > +template< typename MatrixType_, typename Preconditioner_> +struct traits > { - typedef _MatrixType MatrixType; - typedef _Preconditioner Preconditioner; + typedef MatrixType_ MatrixType; + typedef Preconditioner_ Preconditioner; }; } @@ -111,13 +113,13 @@ struct traits > /** \ingroup IterativeLinearSolvers_Module * \brief A conjugate gradient solver for sparse (or dense) least-square problems * - * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm. + * This class solves for the least-squares solution to A x = b using an iterative conjugate gradient algorithm. * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability. * Otherwise, the SparseLU or SparseQR classes might be preferable. * The matrix A and the vectors x and b can be either dense or sparse. * - * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _Preconditioner the type of the preconditioner. 
Default is LeastSquareDiagonalPreconditioner + * \tparam MatrixType_ the type of the matrix A, can be a dense or a sparse matrix. + * \tparam Preconditioner_ the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner * * \implsparsesolverconcept * @@ -145,8 +147,8 @@ struct traits > * * \sa class ConjugateGradient, SparseLU, SparseQR */ -template< typename _MatrixType, typename _Preconditioner> -class LeastSquaresConjugateGradient : public IterativeSolverBase > +template< typename MatrixType_, typename Preconditioner_> +class LeastSquaresConjugateGradient : public IterativeSolverBase > { typedef IterativeSolverBase Base; using Base::matrix; @@ -155,10 +157,10 @@ class LeastSquaresConjugateGradient : public IterativeSolverBase class SolveWithGuess; @@ -83,7 +85,7 @@ struct evaluator > evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); m_result = solve.guess(); solve.dec()._solve_with_guess_impl(solve.rhs(), m_result); } diff --git a/libs/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h b/libs/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h new file mode 100644 index 0000000..b17b1f2 --- /dev/null +++ b/libs/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_JACOBI_MODULE_H +#error "Please include Eigen/Jacobi instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/Jacobi/Jacobi.h b/libs/eigen/Eigen/src/Jacobi/Jacobi.h index 76668a5..5d96989 100644 --- a/libs/eigen/Eigen/src/Jacobi/Jacobi.h +++ b/libs/eigen/Eigen/src/Jacobi/Jacobi.h @@ -11,6 +11,8 @@ #ifndef EIGEN_JACOBI_H #define EIGEN_JACOBI_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup Jacobi_Module @@ -161,7 +163,7 @@ template EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { - makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); + makeGivens(p, q, r, std::conditional_t::IsComplex, internal::true_type, internal::false_type>()); } @@ -232,13 +234,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar { using std::sqrt; using std::abs; - if(q==Scalar(0)) + if(numext::is_exactly_zero(q)) { m_c = p::size, - OtherPacketSize = packet_traits::size - }; typedef typename packet_traits::type Packet; typedef typename packet_traits::type OtherPacket; + enum { + RequiredAlignment = plain_enum_max(unpacket_traits::alignment, + unpacket_traits::alignment), + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + /*** dynamic-size vectorized paths ***/ - if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + if(size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) { // both vectors are sequentially stored in memory => vectorization enum { Peeling = 2 }; @@ -421,11 +426,11 @@ struct apply_rotation_in_the_plane_selector0) // FIXME should be compared to the required alignment + else if(SizeAtCompileTime != Dynamic && MinAlignment >= RequiredAlignment) { const OtherPacket pc = pset1(c); const OtherPacket ps = pset1(s); - conj_helper::IsComplex,false> pcj; + conj_helper::IsComplex,false> pcj; conj_helper pm; Scalar* EIGEN_RESTRICT px = x; Scalar* EIGEN_RESTRICT py = y; @@ -450,11 +455,11 @@ struct apply_rotation_in_the_plane_selector EIGEN_DEVICE_FUNC -void /*EIGEN_DONT_INLINE*/ 
apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) +void inline apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - const bool Vectorizable = (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit) - && (int(packet_traits::size) == int(packet_traits::size)); + constexpr bool Vectorizable = (int(evaluator::Flags) & int(evaluator::Flags) & PacketAccessBit) && + (int(packet_traits::size) == int(packet_traits::size)); eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); @@ -466,13 +471,13 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x OtherScalar c = j.c(); OtherScalar s = j.s(); - if (c==OtherScalar(1) && s==OtherScalar(0)) + if (numext::is_exactly_one(c) && numext::is_exactly_zero(s)) return; apply_rotation_in_the_plane_selector< Scalar,OtherScalar, VectorX::SizeAtCompileTime, - EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment), + plain_enum_min(evaluator::Alignment, evaluator::Alignment), Vectorizable>::run(x,incrx,y,incry,size,c,s); } diff --git a/libs/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..eb1d671 --- /dev/null +++ b/libs/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#error "Please include Eigen/KLUSupport instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/KLUSupport/KLUSupport.h b/libs/eigen/Eigen/src/KLUSupport/KLUSupport.h index 215db35..bfe2f66 100644 --- a/libs/eigen/Eigen/src/KLUSupport/KLUSupport.h +++ b/libs/eigen/Eigen/src/KLUSupport/KLUSupport.h @@ -10,6 +10,8 @@ #ifndef EIGEN_KLUSUPPORT_H #define EIGEN_KLUSUPPORT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /* TODO extract L, extract U, compute det, etc... */ @@ -23,7 +25,7 @@ namespace Eigen { * * \warning The input matrix A should be in a \b compressed and \b column-major form. * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. 
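Note: the Jacobi hunks above replace literal floating-point comparisons such as c==OtherScalar(1) and tau!=Scalar(0) with numext::is_exactly_one/numext::is_exactly_zero. The comparison is still intentionally bit-exact; routing it through a named helper documents that intent and confines any float-equality warning suppression to one place. A sketch of such a helper (hypothetical, not Eigen's actual definition):

    // Deliberate exact comparison against zero, centralised so the intent is
    // explicit and compiler warnings can be silenced at a single site.
    template <typename T>
    constexpr bool is_exactly_zero(const T& x) {
      return x == T(0);
    }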
- * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * * \implsparsesolverconcept * @@ -56,15 +58,15 @@ inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], kl } -template -class KLU : public SparseSolverBase > +template +class KLU : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase > Base; using Base::m_isInitialized; public: using Base::_solve_impl; - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -263,16 +265,16 @@ class KLU : public SparseSolverBase > template void grab(const EigenBase &A) { - mp_matrix.~KLUMatrixRef(); - ::new (&mp_matrix) KLUMatrixRef(A.derived()); + internal::destroy_at(&mp_matrix); + internal::construct_at(&mp_matrix, A.derived()); } void grab(const KLUMatrixRef &A) { if(&(A.derived()) != &mp_matrix) { - mp_matrix.~KLUMatrixRef(); - ::new (&mp_matrix) KLUMatrixRef(A); + internal::destroy_at(&mp_matrix); + internal::construct_at(&mp_matrix, A); } } diff --git a/libs/eigen/Eigen/src/LU/Determinant.h b/libs/eigen/Eigen/src/LU/Determinant.h index 3a41e6f..80e695d 100644 --- a/libs/eigen/Eigen/src/LU/Determinant.h +++ b/libs/eigen/Eigen/src/LU/Determinant.h @@ -10,6 +10,8 @@ #ifndef EIGEN_DETERMINANT_H #define EIGEN_DETERMINANT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -80,8 +82,8 @@ template struct determinant_impl Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); - return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + - internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + return internal::pmadd(static_cast(-m(0,3)),d3_0, static_cast(m(1,3)*d3_1)) + + internal::pmadd(static_cast(-m(2,3)),d3_2, static_cast(m(3,3)*d3_3)); } protected: static EIGEN_DEVICE_FUNC @@ -93,7 +95,7 @@ protected: static EIGEN_DEVICE_FUNC Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) { - return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); + return internal::pmadd(m(i0,2), d0, internal::pmadd(static_cast(-m(i1,2)), d1, static_cast(m(i2,2)*d2))); } }; @@ -109,7 +111,7 @@ inline typename internal::traits::Scalar MatrixBase::determina { eigen_assert(rows() == cols()); typedef typename internal::nested_eval::type Nested; - return internal::determinant_impl::type>::run(derived()); + return internal::determinant_impl>::run(derived()); } } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/LU/FullPivLU.h b/libs/eigen/Eigen/src/LU/FullPivLU.h index ba1749f..259b549 100644 --- a/libs/eigen/Eigen/src/LU/FullPivLU.h +++ b/libs/eigen/Eigen/src/LU/FullPivLU.h @@ -10,11 +10,13 @@ #ifndef EIGEN_LU_H #define EIGEN_LU_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -30,7 +32,7 @@ template struct traits > * * \brief LU decomposition of a matrix with complete pivoting, and related features * - * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing 
the LU decomposition * * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is @@ -57,11 +59,11 @@ template struct traits > * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ -template class FullPivLU - : public SolverBase > +template class FullPivLU + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -419,10 +421,7 @@ template class FullPivLU protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void computeInPlace(); @@ -487,8 +486,6 @@ FullPivLU::FullPivLU(EigenBase& matrix) template void FullPivLU::computeInPlace() { - check_template_parameters(); - // the permutations are stored as int indices, so just to be sure: eigen_assert(m_lu.rows()<=NumTraits::highest() && m_lu.cols()<=NumTraits::highest()); @@ -522,7 +519,7 @@ void FullPivLU::computeInPlace() row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner, col_of_biggest_in_corner += k; // need to add k to them. - if(biggest_in_corner==Score(0)) + if(numext::is_exactly_zero(biggest_in_corner)) { // before exiting, make sure to initialize the still uninitialized transpositions // in a sane state without destroying what we already have. @@ -613,15 +610,15 @@ MatrixType FullPivLU::reconstructedMatrix() const /********* Implementation of kernel() **************************************************/ namespace internal { -template -struct kernel_retval > - : kernel_retval_base > +template +struct kernel_retval > + : kernel_retval_base > { - EIGEN_MAKE_KERNEL_HELPERS(FullPivLU<_MatrixType>) + EIGEN_MAKE_KERNEL_HELPERS(FullPivLU) - enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( - MatrixType::MaxColsAtCompileTime, - MatrixType::MaxRowsAtCompileTime) + enum { MaxSmallDimAtCompileTime = min_size_prefer_fixed( + MatrixType::MaxColsAtCompileTime, + MatrixType::MaxRowsAtCompileTime) }; template void evalTo(Dest& dst) const @@ -699,15 +696,15 @@ struct kernel_retval > /***** Implementation of image() *****************************************************/ -template -struct image_retval > - : image_retval_base > +template +struct image_retval > + : image_retval_base > { - EIGEN_MAKE_IMAGE_HELPERS(FullPivLU<_MatrixType>) + EIGEN_MAKE_IMAGE_HELPERS(FullPivLU) - enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( - MatrixType::MaxColsAtCompileTime, - MatrixType::MaxRowsAtCompileTime) + enum { MaxSmallDimAtCompileTime = min_size_prefer_fixed( + MatrixType::MaxColsAtCompileTime, + MatrixType::MaxRowsAtCompileTime) }; template void evalTo(Dest& dst) const @@ -740,9 +737,9 @@ struct image_retval > } // end namespace internal #ifndef EIGEN_PARSED_BY_DOXYGEN -template +template template -void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const +void FullPivLU::_solve_impl(const RhsType &rhs, DstType &dst) const { /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}. 
* So we proceed as follows: @@ -787,9 +784,9 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const dst.row(permutationQ().indices().coeff(i)).setZero(); } -template +template template -void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void FullPivLU::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}, * and since permutations are real and unitary, we can write this diff --git a/libs/eigen/Eigen/src/LU/InternalHeaderCheck.h b/libs/eigen/Eigen/src/LU/InternalHeaderCheck.h new file mode 100644 index 0000000..f346b17 --- /dev/null +++ b/libs/eigen/Eigen/src/LU/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_LU_MODULE_H +#error "Please include Eigen/LU instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/LU/InverseImpl.h b/libs/eigen/Eigen/src/LU/InverseImpl.h index a40cefa..bcfe703 100644 --- a/libs/eigen/Eigen/src/LU/InverseImpl.h +++ b/libs/eigen/Eigen/src/LU/InverseImpl.h @@ -11,6 +11,8 @@ #ifndef EIGEN_INVERSE_IMPL_H #define EIGEN_INVERSE_IMPL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -309,13 +311,13 @@ struct Assignment, internal::assign_op4) || (extract_data(src.nestedExpression())!=extract_data(dst))) && "Aliasing problem detected in inverse(), you need to do inverse().eval() here."); typedef typename internal::nested_eval::type ActualXprType; - typedef typename internal::remove_all::type ActualXprTypeCleanded; + typedef internal::remove_all_t ActualXprTypeCleanded; ActualXprType actual_xpr(src.nestedExpression()); @@ -385,11 +387,11 @@ inline void MatrixBase::computeInverseAndDetWithCheck( eigen_assert(rows() == cols()); // for 2x2, it's worth giving a chance to avoid evaluating. // for larger sizes, evaluating has negligible cost and limits code size. 
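The two-branch typedef below (now spelled with std::conditional_t) feeds computeInverseAndDetWithCheck(), whose calling pattern is easy to miss from the diff alone. A minimal sketch of the public API (illustrative, not from the patch):

    #include <Eigen/LU>

    void invert_safely() {
      Eigen::Matrix3d m = Eigen::Matrix3d::Random();
      Eigen::Matrix3d inv;
      double det;
      bool invertible;
      // One pass computes the inverse and the determinant, and reports
      // near-singular inputs instead of silently returning Inf/NaN.
      m.computeInverseAndDetWithCheck(inv, det, invertible);
      if (!invertible) {
        // fall back, e.g. to a least-squares or pseudo-inverse path
      }
    }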
- typedef typename internal::conditional< + typedef std::conditional_t< RowsAtCompileTime == 2, - typename internal::remove_all::type>::type, + internal::remove_all_t::type>, PlainObject - >::type MatrixType; + > MatrixType; internal::compute_inverse_and_det_with_check::run (derived(), absDeterminantThreshold, inverse, determinant, invertible); } diff --git a/libs/eigen/Eigen/src/LU/PartialPivLU.h b/libs/eigen/Eigen/src/LU/PartialPivLU.h index 34aed72..1377398 100644 --- a/libs/eigen/Eigen/src/LU/PartialPivLU.h +++ b/libs/eigen/Eigen/src/LU/PartialPivLU.h @@ -11,16 +11,18 @@ #ifndef EIGEN_PARTIALLU_H #define EIGEN_PARTIALLU_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; typedef int StorageIndex; - typedef traits<_MatrixType> BaseTraits; + typedef traits BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, CoeffReadCost = Dynamic @@ -46,7 +48,7 @@ struct enable_if_ref,Derived> { * * \brief LU decomposition of a matrix with partial pivoting, and related features * - * \tparam _MatrixType the type of the matrix of which we are computing the LU decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the LU decomposition * * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P @@ -73,12 +75,12 @@ struct enable_if_ref,Derived> { * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ -template class PartialPivLU - : public SolverBase > +template class PartialPivLU + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -265,10 +267,7 @@ template class PartialPivLU protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void compute(); @@ -334,12 +333,12 @@ namespace internal { template struct partial_lu_impl { - static const int UnBlockedBound = 16; - static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; - static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + static constexpr int UnBlockedBound = 16; + static constexpr bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static constexpr int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; // Remaining rows and columns at compile-time: - static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; - static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; + static constexpr int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static constexpr int RCols = SizeAtCompileTime==2 ? 
1 : Dynamic; typedef Matrix MatrixType; typedef Ref MatrixTypeRef; typedef Ref > BlockType; @@ -379,7 +378,7 @@ struct partial_lu_impl row_transpositions[k] = PivIndex(row_of_biggest_in_col); - if(biggest_in_corner != Score(0)) + if(!numext::is_exactly_zero(biggest_in_corner)) { if(k != row_of_biggest_in_col) { @@ -405,7 +404,7 @@ struct partial_lu_impl { Index k = endk; row_transpositions[k] = PivIndex(k); - if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) + if (numext::is_exactly_zero(Scoring()(lu(k, k))) && first_zero_pivot == -1) first_zero_pivot = k; } @@ -515,7 +514,7 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t partial_lu_impl < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex, - EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> + internal::min_size_prefer_fixed(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } @@ -524,8 +523,6 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t template void PartialPivLU::compute() { - check_template_parameters(); - // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()::highest()); diff --git a/libs/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h b/libs/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h index 755168a..b636442 100644 --- a/libs/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +++ b/libs/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h @@ -33,48 +33,61 @@ #ifndef EIGEN_PARTIALLU_LAPACK_H #define EIGEN_PARTIALLU_LAPACK_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -/** \internal Specialization for the data types supported by LAPACKe */ +namespace lapacke_helpers { +// ------------------------------------------------------------------------------------------------------------------- +// Generic lapacke partial lu implementation that converts arguments and dispatches to the function above +// ------------------------------------------------------------------------------------------------------------------- -#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \ -template \ -struct partial_lu_impl \ -{ \ - /* \internal performs the LU decomposition in-place of the matrix represented */ \ - static lapack_int blocked_lu(Index rows, Index cols, EIGTYPE* lu_data, Index luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \ - { \ - EIGEN_UNUSED_VARIABLE(maxBlockSize);\ - lapack_int matrix_order, first_zero_pivot; \ - lapack_int m, n, lda, *ipiv, info; \ - EIGTYPE* a; \ -/* Set up parameters for ?getrf */ \ - matrix_order = StorageOrder==RowMajor ? 
LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \ - lda = convert_index(luStride); \ - a = lu_data; \ - ipiv = row_transpositions; \ - m = convert_index(rows); \ - n = convert_index(cols); \ - nb_transpositions = 0; \ -\ - info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \ -\ - for(int i=0;i= 0); \ -/* something should be done with nb_transpositions */ \ -\ - first_zero_pivot = info; \ - return first_zero_pivot; \ - } \ +template +struct lapacke_partial_lu { + /** \internal performs the LU decomposition in-place of the matrix represented */ + static lapack_int blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, lapack_int* row_transpositions, + lapack_int& nb_transpositions, lapack_int maxBlockSize=256) + { + EIGEN_UNUSED_VARIABLE(maxBlockSize); + // Set up parameters for getrf + lapack_int matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; + lapack_int lda = to_lapack(luStride); + Scalar* a = lu_data; + lapack_int* ipiv = row_transpositions; + lapack_int m = to_lapack(rows); + lapack_int n = to_lapack(cols); + nb_transpositions = 0; + + lapack_int info = getrf(matrix_order, m, n, to_lapack(a), lda, ipiv ); + eigen_assert(info >= 0); + + for(int i=0; i \ +struct partial_lu_impl : public lapacke_helpers::lapacke_partial_lu {}; + +EIGEN_LAPACKE_PARTIAL_LU(double) +EIGEN_LAPACKE_PARTIAL_LU(float) +EIGEN_LAPACKE_PARTIAL_LU(std::complex) +EIGEN_LAPACKE_PARTIAL_LU(std::complex) + +#undef EIGEN_LAPACKE_PARTIAL_LU } // end namespace internal diff --git a/libs/eigen/Eigen/src/LU/arch/InverseSize4.h b/libs/eigen/Eigen/src/LU/arch/InverseSize4.h index a232ffc..25f4601 100644 --- a/libs/eigen/Eigen/src/LU/arch/InverseSize4.h +++ b/libs/eigen/Eigen/src/LU/arch/InverseSize4.h @@ -35,6 +35,15 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H +#include "../InternalHeaderCheck.h" + +#if EIGEN_COMP_GNUC_STRICT +// These routines requires bit manipulation of the sign, which is not compatible +// with fastmath. 
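The pragma that follows exists because these kernels flip signs by XORing IEEE-754 sign bits (the pxor calls against the sign-mask constants further down), and under -ffast-math GCC may assume signed zeros do not matter and constant-fold such bit patterns away. A scalar sketch of the same trick (illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // Negate x by toggling its sign bit, the scalar analogue of the
    // pxor(rd, sign_mask) idiom used in the vectorized inverse below.
    float flip_sign(float x) {
      std::uint32_t bits;
      std::memcpy(&bits, &x, sizeof bits);   // bit-exact view of the float
      bits ^= 0x80000000u;                   // toggle the IEEE-754 sign bit
      std::memcpy(&x, &bits, sizeof bits);
      return x;
    }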
+#pragma GCC push_options +#pragma GCC optimize ("no-fast-math") +#endif + namespace Eigen { namespace internal @@ -48,7 +57,7 @@ struct compute_inverse_size4::Alignment, StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) }; - typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; + typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject> ActualMatrixType; static void run(const MatrixType &mat, ResultType &result) { @@ -56,10 +65,10 @@ struct compute_inverse_size4(data); - Packet4f _L2 = ploadt(data + stride*4); - Packet4f _L3 = ploadt(data + stride*8); - Packet4f _L4 = ploadt(data + stride*12); + Packet4f L1 = ploadt(data); + Packet4f L2 = ploadt(data + stride*4); + Packet4f L3 = ploadt(data + stride*8); + Packet4f L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -68,17 +77,17 @@ struct compute_inverse_size4(1.0f), det); + Packet4f rd = preciprocal(det); // Four sub-matrices of the inverse Packet4f iA, iB, iC, iD; @@ -143,8 +152,8 @@ struct compute_inverse_size4(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; - const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f}; + const Packet4f p4f_sign_PNNP = pload(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -173,9 +182,9 @@ struct compute_inverse_size4::Alignment, StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) }; - typedef typename conditional<(MatrixType::Flags & LinearAccessBit), - MatrixType const &, - typename MatrixType::PlainObject>::type + typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), + MatrixType const &, + typename MatrixType::PlainObject> ActualMatrixType; static void run(const MatrixType &mat, ResultType &result) @@ -326,10 +335,10 @@ struct compute_inverse_size4(0x8000000000000000ull)}; - const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; - const Packet2d sign_PN = ploadu(sign_mask1); - const Packet2d sign_NP = ploadu(sign_mask2); + EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0}; + EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0}; + const Packet2d sign_PN = pload(sign_mask1); + const Packet2d sign_NP = pload(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); @@ -348,4 +357,9 @@ struct compute_inverse_size4 class PastixLU; -template class PastixLLT; -template class PastixLDLT; +template class PastixLU; +template class PastixLLT; +template class PastixLDLT; namespace internal { template struct pastix_traits; - template - struct pastix_traits< PastixLU<_MatrixType> > + template + struct pastix_traits< PastixLU > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; - template - struct pastix_traits< PastixLLT<_MatrixType,Options> > + template + struct pastix_traits< PastixLLT > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename 
_MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; - template - struct pastix_traits< PastixLDLT<_MatrixType,Options> > + template + struct pastix_traits< PastixLDLT > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) @@ -134,8 +136,8 @@ class PastixBase : public SparseSolverBase public: using Base::_solve_impl; - typedef typename internal::pastix_traits::MatrixType _MatrixType; - typedef _MatrixType MatrixType; + typedef typename internal::pastix_traits::MatrixType MatrixType_; + typedef MatrixType_ MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -397,7 +399,7 @@ bool PastixBase::_solve_impl(const MatrixBase &b, MatrixBase &x * This interface can symmetrize the input matrix otherwise. * The vectors or matrices X and B can be either dense or sparse. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * \tparam IsStrSym Indicates if the input matrix has a symmetric pattern, default is false * NOTE : Note that if the analysis and factorization phase are called separately, * the input matrix will be symmetrized at each call, hence it is advised to @@ -408,11 +410,11 @@ bool PastixBase::_solve_impl(const MatrixBase &b, MatrixBase &x * \sa \ref TutorialSparseSolverConcept, class SparseLU * */ -template -class PastixLU : public PastixBase< PastixLU<_MatrixType> > +template +class PastixLU : public PastixBase< PastixLU > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef PastixBase > Base; typedef typename Base::ColSpMatrix ColSpMatrix; typedef typename MatrixType::StorageIndex StorageIndex; @@ -520,16 +522,16 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > * * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT */ -template -class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > +template +class PastixLLT : public PastixBase< PastixLLT > { public: - typedef _MatrixType MatrixType; - typedef PastixBase > Base; + typedef MatrixType_ MatrixType; + typedef PastixBase > Base; typedef typename Base::ColSpMatrix ColSpMatrix; public: - enum { UpLo = _UpLo }; + enum { UpLo = UpLo_ }; PastixLLT() : Base() { init(); @@ -604,16 +606,16 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > * * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT */ -template -class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> > +template +class PastixLDLT : public PastixBase< PastixLDLT > { public: - typedef _MatrixType MatrixType; - typedef PastixBase > Base; + typedef MatrixType_ MatrixType; + typedef PastixBase > Base; typedef typename Base::ColSpMatrix ColSpMatrix; public: - 
enum { UpLo = _UpLo }; + enum { UpLo = UpLo_ }; PastixLDLT():Base() { init(); diff --git a/libs/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..8ef33f0 --- /dev/null +++ b/libs/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_PARDISOSUPPORT_MODULE_H +#error "Please include Eigen/PardisoSupport instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/PardisoSupport/PardisoSupport.h b/libs/eigen/Eigen/src/PardisoSupport/PardisoSupport.h index f89b79b..e9815e6 100644 --- a/libs/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/libs/eigen/Eigen/src/PardisoSupport/PardisoSupport.h @@ -32,11 +32,13 @@ #ifndef EIGEN_PARDISOSUPPORT_H #define EIGEN_PARDISOSUPPORT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { -template class PardisoLU; -template class PardisoLLT; -template class PardisoLDLT; +template class PardisoLU; +template class PardisoLLT; +template class PardisoLDLT; namespace internal { @@ -66,31 +68,31 @@ namespace internal template struct pardiso_traits; - template - struct pardiso_traits< PardisoLU<_MatrixType> > + template + struct pardiso_traits< PardisoLU > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; - template - struct pardiso_traits< PardisoLLT<_MatrixType, Options> > + template + struct pardiso_traits< PardisoLLT > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; - template - struct pardiso_traits< PardisoLDLT<_MatrixType, Options> > + template + struct pardiso_traits< PardisoLDLT > { - typedef _MatrixType MatrixType; - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::StorageIndex StorageIndex; + typedef MatrixType_ MatrixType; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; + typedef typename MatrixType_::StorageIndex StorageIndex; }; } // end namespace internal @@ -375,7 +377,7 @@ void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase * By default, it runs in in-core mode. 
To enable PARDISO's out-of-core feature, set: * \code solver.pardisoParameterArray()[59] = 1; \endcode * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * * \implsparsesolverconcept * @@ -437,21 +439,21 @@ class PardisoLU : public PardisoImpl< PardisoLU > * * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT */ -template -class PardisoLLT : public PardisoImpl< PardisoLLT > +template +class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: - typedef PardisoImpl< PardisoLLT > Base; + typedef PardisoImpl< PardisoLLT > Base; using Base::pardisoInit; using Base::m_matrix; - friend class PardisoImpl< PardisoLLT >; + friend class PardisoImpl< PardisoLLT >; public: typedef typename Base::Scalar Scalar; typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; - enum { UpLo = _UpLo }; + enum { UpLo = UpLo_ }; using Base::compute; PardisoLLT() diff --git a/libs/eigen/Eigen/src/QR/ColPivHouseholderQR.h b/libs/eigen/Eigen/src/QR/ColPivHouseholderQR.h index 9b677e9..c906997 100644 --- a/libs/eigen/Eigen/src/QR/ColPivHouseholderQR.h +++ b/libs/eigen/Eigen/src/QR/ColPivHouseholderQR.h @@ -11,11 +11,13 @@ #ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_H #define EIGEN_COLPIVOTINGHOUSEHOLDERQR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -31,7 +33,7 @@ template struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting * - * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R * such that @@ -48,12 +50,12 @@ template struct traits > * * \sa MatrixBase::colPivHouseholderQr() */ -template class ColPivHouseholderQR - : public SolverBase > +template class ColPivHouseholderQR + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -67,7 +69,7 @@ template class ColPivHouseholderQR typedef typename internal::plain_row_type::type IntRowVectorType; typedef typename internal::plain_row_type::type RowVectorType; typedef typename internal::plain_row_type::type RealRowVectorType; - typedef HouseholderSequence::type> HouseholderSequenceType; + typedef HouseholderSequence> HouseholderSequenceType; typedef typename MatrixType::PlainObject PlainObject; private: @@ -217,6 +219,21 @@ template class ColPivHouseholderQR return m_colsPermutation; } + /** \returns the determinant of the matrix of which + * *this is the QR decomposition. It has only linear complexity + * (that is, O(n) where n is the dimension of the square matrix) + * as the QR decomposition has already been computed. + * + * \note This is only for square matrices. + * + * \warning a determinant can be very big or small, so for matrices + * of large enough dimension, there is a risk of overflow/underflow. + * One way to work around that is to use logAbsDeterminant() instead. 
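The new determinant() accessor assembles det(A) from pieces the factorization already holds: the product of R's diagonal, det(Q) recovered from the Householder coefficients, and the sign of the column permutation (m_det_p below). A quick consistency sketch (illustrative, not from the patch):

    #include <Eigen/QR>
    #include <iostream>

    void check_det() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
      Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr(A);
      // det(A) = prod(diag(R)) * det(Q) * sign(P); should match the
      // direct computation up to rounding.
      std::cout << qr.determinant() << " vs " << A.determinant() << "\n";
    }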
+ * + * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::Scalar determinant() const; + /** \returns the absolute value of the determinant of the matrix of which * *this is the QR decomposition. It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -228,7 +245,7 @@ template class ColPivHouseholderQR * of large enough dimension, there is a risk of overflow/underflow. * One way to work around that is to use logAbsDeterminant() instead. * - * \sa logAbsDeterminant(), MatrixBase::determinant() + * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar absDeterminant() const; @@ -242,7 +259,7 @@ template class ColPivHouseholderQR * \note This method is useful to work around the risk of overflow/underflow that's inherent * to determinant computation. * - * \sa absDeterminant(), MatrixBase::determinant() + * \sa determinant(), absDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar logAbsDeterminant() const; @@ -426,10 +443,7 @@ template class ColPivHouseholderQR friend class CompleteOrthogonalDecomposition; - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void computeInPlace(); @@ -443,9 +457,19 @@ template class ColPivHouseholderQR bool m_isInitialized, m_usePrescribedThreshold; RealScalar m_prescribedThreshold, m_maxpivot; Index m_nonzero_pivots; - Index m_det_pq; + Index m_det_p; }; +template +typename MatrixType::Scalar ColPivHouseholderQR::determinant() const +{ + eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); + eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!"); + Scalar detQ; + internal::householder_determinant::IsComplex>::run(m_hCoeffs, detQ); + return m_qr.diagonal().prod() * detQ * Scalar(m_det_p); +} + template typename MatrixType::RealScalar ColPivHouseholderQR::absDeterminant() const { @@ -481,8 +505,6 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const template void ColPivHouseholderQR::computeInPlace() { - check_template_parameters(); - // the column permutation is stored as int indices, so just to be sure: eigen_assert(m_qr.cols()<=NumTraits::highest()); @@ -555,7 +577,7 @@ void ColPivHouseholderQR::computeInPlace() // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf // and used in LAPACK routines xGEQPF and xGEQP3. // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html - if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) { + if (!numext::is_exactly_zero(m_colNormsUpdated.coeffRef(j))) { RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); temp = temp < RealScalar(0) ? RealScalar(0) : temp; @@ -577,14 +599,14 @@ void ColPivHouseholderQR::computeInPlace() for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); - m_det_pq = (number_of_transpositions%2) ? -1 : 1; + m_det_p = (number_of_transpositions%2) ? 
-1 : 1; m_isInitialized = true; } #ifndef EIGEN_PARSED_BY_DOXYGEN -template +template template -void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const +void ColPivHouseholderQR::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index nonzero_pivots = nonzeroPivots(); @@ -606,9 +628,9 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } -template +template template -void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void ColPivHouseholderQR::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { const Index nonzero_pivots = nonzeroPivots(); diff --git a/libs/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/libs/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h index 4e9651f..7652d31 100644 --- a/libs/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +++ b/libs/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h @@ -34,6 +34,8 @@ #ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H #define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ diff --git a/libs/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/libs/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 486d337..02583a2 100644 --- a/libs/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/libs/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -10,12 +10,14 @@ #ifndef EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H #define EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template -struct traits > - : traits<_MatrixType> { +template +struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; typedef int StorageIndex; @@ -30,7 +32,7 @@ struct traits > * * \brief Complete orthogonal decomposition (COD) of a matrix. * - * \param MatrixType the type of the matrix of which we are computing the COD. + * \tparam MatrixType_ the type of the matrix of which we are computing the COD. 
* * This class performs a rank-revealing complete orthogonal decomposition of a * matrix \b A into matrices \b P, \b Q, \b T, and \b Z such that @@ -47,11 +49,11 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template class CompleteOrthogonalDecomposition - : public SolverBase > +template class CompleteOrthogonalDecomposition + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; template @@ -71,8 +73,8 @@ template class CompleteOrthogonalDecomposition typedef typename internal::plain_row_type::type RealRowVectorType; typedef HouseholderSequence< - MatrixType, typename internal::remove_all< - typename HCoeffsType::ConjugateReturnType>::type> + MatrixType, internal::remove_all_t< + typename HCoeffsType::ConjugateReturnType>> HouseholderSequenceType; typedef typename MatrixType::PlainObject PlainObject; @@ -177,7 +179,7 @@ template class CompleteOrthogonalDecomposition * \code matrixT().template triangularView() \endcode * For rank-deficient matrices, use * \code - * matrixR().topLeftCorner(rank(), rank()).template triangularView() + * matrixT().topLeftCorner(rank(), rank()).template triangularView() * \endcode */ const MatrixType& matrixT() const { return m_cpqr.matrixQR(); } @@ -195,6 +197,21 @@ template class CompleteOrthogonalDecomposition return m_cpqr.colsPermutation(); } + /** \returns the determinant of the matrix of which + * *this is the complete orthogonal decomposition. It has only linear + * complexity (that is, O(n) where n is the dimension of the square matrix) + * as the complete orthogonal decomposition has already been computed. + * + * \note This is only for square matrices. + * + * \warning a determinant can be very big or small, so for matrices + * of large enough dimension, there is a risk of overflow/underflow. + * One way to work around that is to use logAbsDeterminant() instead. + * + * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::Scalar determinant() const; + /** \returns the absolute value of the determinant of the matrix of which * *this is the complete orthogonal decomposition. It has only linear * complexity (that is, O(n) where n is the dimension of the square matrix) @@ -206,7 +223,7 @@ template class CompleteOrthogonalDecomposition * of large enough dimension, there is a risk of overflow/underflow. * One way to work around that is to use logAbsDeterminant() instead. * - * \sa logAbsDeterminant(), MatrixBase::determinant() + * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar absDeterminant() const; @@ -221,7 +238,7 @@ template class CompleteOrthogonalDecomposition * \note This method is useful to work around the risk of overflow/underflow * that's inherent to determinant computation. 
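For context on where the COD earns its keep: unlike plain QR it is rank-revealing, so it yields minimum-norm least-squares solutions and a Moore-Penrose pseudo-inverse. A usage sketch (illustrative; public API only):

    #include <Eigen/QR>

    void rank_deficient_lsq() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(8, 5);
      A.col(4) = A.col(0) + A.col(1);            // force a rank deficiency
      Eigen::VectorXd b = Eigen::VectorXd::Random(8);
      Eigen::CompleteOrthogonalDecomposition<Eigen::MatrixXd> cod(A);
      Eigen::VectorXd x = cod.solve(b);          // minimum-norm least-squares solution
      Eigen::MatrixXd pinv = cod.pseudoInverse();
    }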
* - * \sa absDeterminant(), MatrixBase::determinant() + * \sa determinant(), absDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar logAbsDeterminant() const; @@ -377,9 +394,7 @@ template class CompleteOrthogonalDecomposition #endif protected: - static void check_template_parameters() { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) template void _check_solve_assertion(const Rhs& b) const { @@ -407,6 +422,12 @@ template class CompleteOrthogonalDecomposition RowVectorType m_temp; }; +template +typename MatrixType::Scalar +CompleteOrthogonalDecomposition::determinant() const { + return m_cpqr.determinant(); +} + template typename MatrixType::RealScalar CompleteOrthogonalDecomposition::absDeterminant() const { @@ -429,8 +450,6 @@ CompleteOrthogonalDecomposition::logAbsDeterminant() const { template void CompleteOrthogonalDecomposition::computeInPlace() { - check_template_parameters(); - // the column permutation is stored as int indices, so just to be sure: eigen_assert(m_cpqr.cols() <= NumTraits::highest()); @@ -529,9 +548,9 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( } #ifndef EIGEN_PARSED_BY_DOXYGEN -template +template template -void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( +void CompleteOrthogonalDecomposition::_solve_impl( const RhsType& rhs, DstType& dst) const { const Index rank = this->rank(); if (rank == 0) { @@ -561,9 +580,9 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( dst = colsPermutation() * dst; } -template +template template -void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void CompleteOrthogonalDecomposition::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { const Index rank = this->rank(); diff --git a/libs/eigen/Eigen/src/QR/FullPivHouseholderQR.h b/libs/eigen/Eigen/src/QR/FullPivHouseholderQR.h index d0664a1..ec7e19b 100644 --- a/libs/eigen/Eigen/src/QR/FullPivHouseholderQR.h +++ b/libs/eigen/Eigen/src/QR/FullPivHouseholderQR.h @@ -11,12 +11,14 @@ #ifndef EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H #define EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -40,7 +42,7 @@ struct traits > * * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting * - * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition * * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R * such that @@ -57,12 +59,12 @@ struct traits > * * \sa MatrixBase::fullPivHouseholderQr() */ -template class FullPivHouseholderQR - : public SolverBase > +template class FullPivHouseholderQR + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -74,8 +76,8 @@ template class FullPivHouseholderQR typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix IntDiagSizeVectorType; + internal::min_size_prefer_dynamic(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1, + 
internal::min_size_prefer_fixed(MaxColsAtCompileTime, MaxRowsAtCompileTime)> IntDiagSizeVectorType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type RowVectorType; typedef typename internal::plain_col_type::type ColVectorType; @@ -208,6 +210,21 @@ template class FullPivHouseholderQR return m_rows_transpositions; } + /** \returns the determinant of the matrix of which + * *this is the QR decomposition. It has only linear complexity + * (that is, O(n) where n is the dimension of the square matrix) + * as the QR decomposition has already been computed. + * + * \note This is only for square matrices. + * + * \warning a determinant can be very big or small, so for matrices + * of large enough dimension, there is a risk of overflow/underflow. + * One way to work around that is to use logAbsDeterminant() instead. + * + * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::Scalar determinant() const; + /** \returns the absolute value of the determinant of the matrix of which * *this is the QR decomposition. It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -219,7 +236,7 @@ template class FullPivHouseholderQR * of large enough dimension, there is a risk of overflow/underflow. * One way to work around that is to use logAbsDeterminant() instead. * - * \sa logAbsDeterminant(), MatrixBase::determinant() + * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar absDeterminant() const; @@ -233,7 +250,7 @@ template class FullPivHouseholderQR * \note This method is useful to work around the risk of overflow/underflow that's inherent * to determinant computation. * - * \sa absDeterminant(), MatrixBase::determinant() + * \sa determinant(), absDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar logAbsDeterminant() const; @@ -403,10 +420,7 @@ template class FullPivHouseholderQR protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void computeInPlace(); @@ -420,9 +434,19 @@ template class FullPivHouseholderQR RealScalar m_prescribedThreshold, m_maxpivot; Index m_nonzero_pivots; RealScalar m_precision; - Index m_det_pq; + Index m_det_p; }; +template +typename MatrixType::Scalar FullPivHouseholderQR::determinant() const +{ + eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); + eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!"); + Scalar detQ; + internal::householder_determinant::IsComplex>::run(m_hCoeffs, detQ); + return m_qr.diagonal().prod() * detQ * Scalar(m_det_p); +} + template typename MatrixType::RealScalar FullPivHouseholderQR::absDeterminant() const { @@ -458,8 +482,6 @@ FullPivHouseholderQR& FullPivHouseholderQR::compute(cons template void FullPivHouseholderQR::computeInPlace() { - check_template_parameters(); - using std::abs; Index rows = m_qr.rows(); Index cols = m_qr.cols(); @@ -534,14 +556,14 @@ void FullPivHouseholderQR::computeInPlace() for(Index k = 0; k < size; ++k) m_cols_permutation.applyTranspositionOnTheRight(k, m_cols_transpositions.coeff(k)); - m_det_pq = (number_of_transpositions%2) ? -1 : 1; + m_det_p = (number_of_transpositions%2) ? 
-1 : 1; m_isInitialized = true; } #ifndef EIGEN_PARSED_BY_DOXYGEN -template +template template -void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const +void FullPivHouseholderQR::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index l_rank = rank(); @@ -573,9 +595,9 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } -template +template template -void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void FullPivHouseholderQR::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { const Index l_rank = rank(); diff --git a/libs/eigen/Eigen/src/QR/HouseholderQR.h b/libs/eigen/Eigen/src/QR/HouseholderQR.h index 801739f..abfefd1 100644 --- a/libs/eigen/Eigen/src/QR/HouseholderQR.h +++ b/libs/eigen/Eigen/src/QR/HouseholderQR.h @@ -12,11 +12,13 @@ #ifndef EIGEN_QR_H #define EIGEN_QR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -template struct traits > - : traits<_MatrixType> +template struct traits > + : traits { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; @@ -33,7 +35,7 @@ template struct traits > * * \brief Householder QR decomposition of a matrix * - * \tparam _MatrixType the type of the matrix of which we are computing the QR decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition * * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R * such that @@ -53,12 +55,12 @@ template struct traits > * * \sa MatrixBase::householderQr() */ -template class HouseholderQR - : public SolverBase > +template class HouseholderQR + : public SolverBase > { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef SolverBase Base; friend class SolverBase; @@ -70,7 +72,7 @@ template class HouseholderQR typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; - typedef HouseholderSequence::type> HouseholderSequenceType; + typedef HouseholderSequence> HouseholderSequenceType; /** * \brief Default Constructor. @@ -182,6 +184,21 @@ template class HouseholderQR return *this; } + /** \returns the determinant of the matrix of which + * *this is the QR decomposition. It has only linear complexity + * (that is, O(n) where n is the dimension of the square matrix) + * as the QR decomposition has already been computed. + * + * \note This is only for square matrices. + * + * \warning a determinant can be very big or small, so for matrices + * of large enough dimension, there is a risk of overflow/underflow. + * One way to work around that is to use logAbsDeterminant() instead. + * + * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant() + */ + typename MatrixType::Scalar determinant() const; + /** \returns the absolute value of the determinant of the matrix of which * *this is the QR decomposition. It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -193,7 +210,7 @@ template class HouseholderQR * of large enough dimension, there is a risk of overflow/underflow. * One way to work around that is to use logAbsDeterminant() instead. 
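Since HouseholderQR gains the same determinant() member in this patch, the three determinant flavors line up as follows; for large matrices the log form is the robust choice, exactly as the warning above says. Illustrative sketch:

    #include <Eigen/QR>
    #include <iostream>

    void det_flavors() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(200, 200);
      Eigen::HouseholderQR<Eigen::MatrixXd> qr(A);
      double d   = qr.determinant();           // signed det; may overflow at this size
      double ad  = qr.absDeterminant();        // |det|; same overflow caveat
      double lad = qr.logAbsDeterminant();     // log|det|; safe against overflow
      std::cout << "log|det A| = " << lad << "\n";
      (void)d; (void)ad;
    }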
* - * \sa logAbsDeterminant(), MatrixBase::determinant() + * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar absDeterminant() const; @@ -207,7 +224,7 @@ template class HouseholderQR * \note This method is useful to work around the risk of overflow/underflow that's inherent * to determinant computation. * - * \sa absDeterminant(), MatrixBase::determinant() + * \sa determinant(), absDeterminant(), MatrixBase::determinant() */ typename MatrixType::RealScalar logAbsDeterminant() const; @@ -230,10 +247,7 @@ template class HouseholderQR protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void computeInPlace(); @@ -243,6 +257,57 @@ template class HouseholderQR bool m_isInitialized; }; +namespace internal { + +/** \internal */ +template +struct householder_determinant +{ + static void run(const HCoeffs& hCoeffs, Scalar& out_det) + { + out_det = Scalar(1); + Index size = hCoeffs.rows(); + for (Index i = 0; i < size; i ++) + { + // For each valid reflection Q_n, + // det(Q_n) = - conj(h_n) / h_n + // where h_n is the Householder coefficient. + if (hCoeffs(i) != Scalar(0)) + out_det *= - numext::conj(hCoeffs(i)) / hCoeffs(i); + } + } +}; + +/** \internal */ +template +struct householder_determinant +{ + static void run(const HCoeffs& hCoeffs, Scalar& out_det) + { + bool negated = false; + Index size = hCoeffs.rows(); + for (Index i = 0; i < size; i ++) + { + // Each valid reflection negates the determinant. + if (hCoeffs(i) != Scalar(0)) + negated ^= true; + } + out_det = negated ? Scalar(-1) : Scalar(1); + } +}; + +} // end namespace internal + +template +typename MatrixType::Scalar HouseholderQR::determinant() const +{ + eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); + eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!"); + Scalar detQ; + internal::householder_determinant::IsComplex>::run(m_hCoeffs, detQ); + return m_qr.diagonal().prod() * detQ; +} + template typename MatrixType::RealScalar HouseholderQR::absDeterminant() const { @@ -297,6 +362,43 @@ void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename } } +// TODO: add a corresponding public API for updating a QR factorization +/** \internal + * Basically a modified copy of @c Eigen::internal::householder_qr_inplace_unblocked that + * performs a rank-1 update of the QR matrix in compact storage. This function assumes, that + * the first @c k-1 columns of the matrix @c mat contain the QR decomposition of \f$A^N\f$ up to + * column k-1. Then the QR decomposition of the k-th column (given by @c newColumn) is computed by + * applying the k-1 Householder projectors on it and finally compute the projector \f$H_k\f$ of + * it. On exit the matrix @c mat and the vector @c hCoeffs contain the QR decomposition of the + * first k columns of \f$A^N\f$. The \a tempData argument must point to at least mat.cols() scalars. 
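The helper documented here (and defined right below) can grow a QR factorization one column at a time. A hypothetical driver follows, hedged accordingly: this is internal API with no public wrapper yet (the TODO above says as much), so names and availability may change:

    #include <Eigen/QR>

    void incremental_qr() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 3);
      Eigen::MatrixXd qr(5, 3);        // compact QR storage, filled column by column
      Eigen::VectorXd hCoeffs(3);      // one Householder coefficient per column
      Eigen::VectorXd temp(3);         // workspace: at least qr.cols() scalars
      for (Eigen::Index k = 0; k < A.cols(); ++k)
        // After each call, qr/hCoeffs hold the QR of A.leftCols(k+1).
        Eigen::internal::householder_qr_inplace_update(qr, hCoeffs, A.col(k), k, temp.data());
    }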
*/ +template +void householder_qr_inplace_update(MatrixQR& mat, HCoeffs& hCoeffs, const VectorQR& newColumn, + typename MatrixQR::Index k, typename MatrixQR::Scalar* tempData) { + typedef typename MatrixQR::Index Index; + typedef typename MatrixQR::RealScalar RealScalar; + Index rows = mat.rows(); + + eigen_assert(k < mat.cols()); + eigen_assert(k < rows); + eigen_assert(hCoeffs.size() == mat.cols()); + eigen_assert(newColumn.size() == rows); + eigen_assert(tempData); + + // Store new column in mat at column k + mat.col(k) = newColumn; + // Apply H = H_1...H_{k-1} on newColumn (skip if k=0) + for (Index i = 0; i < k; ++i) { + Index remainingRows = rows - i; + mat.col(k) + .tail(remainingRows) + .applyHouseholderOnTheLeft(mat.col(i).tail(remainingRows - 1), hCoeffs.coeffRef(i), tempData + i + 1); + } + // Construct Householder projector in-place in column k + RealScalar beta; + mat.col(k).tail(rows - k).makeHouseholderInPlace(hCoeffs.coeffRef(k), beta); + mat.coeffRef(k, k) = beta; +} + /** \internal */ template +template template -void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const +void HouseholderQR::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); @@ -374,9 +476,9 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.bottomRows(cols()-rank).setZero(); } -template +template template -void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +void HouseholderQR::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); @@ -403,8 +505,6 @@ void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstT template void HouseholderQR::computeInPlace() { - check_template_parameters(); - Index rows = m_qr.rows(); Index cols = m_qr.cols(); Index size = (std::min)(rows,cols); diff --git a/libs/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h b/libs/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h index 1dc7d53..57c2f6a 100644 --- a/libs/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +++ b/libs/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h @@ -34,32 +34,41 @@ #ifndef EIGEN_QR_LAPACKE_H #define EIGEN_QR_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { -/** \internal Specialization for the data types supported by LAPACKe */ +namespace lapacke_helpers { -#define EIGEN_LAPACKE_QR_NOPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \ -template \ -struct householder_qr_inplace_blocked \ -{ \ - static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, \ - typename MatrixQR::Scalar* = 0) \ - { \ - lapack_int m = (lapack_int) mat.rows(); \ - lapack_int n = (lapack_int) mat.cols(); \ - lapack_int lda = (lapack_int) mat.outerStride(); \ - lapack_int matrix_order = (MatrixQR::IsRowMajor) ? 
LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \ - LAPACKE_##LAPACKE_PREFIX##geqrf( matrix_order, m, n, (LAPACKE_TYPE*)mat.data(), lda, (LAPACKE_TYPE*)hCoeffs.data()); \ - hCoeffs.adjointInPlace(); \ - } \ +template +struct lapacke_hqr +{ + static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, typename MatrixQR::Scalar* = 0) + { + lapack_int m = to_lapack(mat.rows()); + lapack_int n = to_lapack(mat.cols()); + lapack_int lda = to_lapack(mat.outerStride()); + lapack_int matrix_order = lapack_storage_of(mat); + geqrf(matrix_order, m, n, to_lapack(mat.data()), lda, to_lapack(hCoeffs.data())); + hCoeffs.adjointInPlace(); + } }; -EIGEN_LAPACKE_QR_NOPIV(double, double, d) -EIGEN_LAPACKE_QR_NOPIV(float, float, s) -EIGEN_LAPACKE_QR_NOPIV(dcomplex, lapack_complex_double, z) -EIGEN_LAPACKE_QR_NOPIV(scomplex, lapack_complex_float, c) +} + +/** \internal Specialization for the data types supported by LAPACKe */ +#define EIGEN_LAPACKE_HH_QR(EIGTYPE) \ +template \ +struct householder_qr_inplace_blocked : public lapacke_helpers::lapacke_hqr {}; + +EIGEN_LAPACKE_HH_QR(double) +EIGEN_LAPACKE_HH_QR(float) +EIGEN_LAPACKE_HH_QR(std::complex) +EIGEN_LAPACKE_HH_QR(std::complex) + +#undef EIGEN_LAPACKE_HH_QR } // end namespace internal diff --git a/libs/eigen/Eigen/src/QR/InternalHeaderCheck.h b/libs/eigen/Eigen/src/QR/InternalHeaderCheck.h new file mode 100644 index 0000000..bf8df01 --- /dev/null +++ b/libs/eigen/Eigen/src/QR/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_QR_MODULE_H +#error "Please include Eigen/QR instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..8d94ba4 --- /dev/null +++ b/libs/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SPQRSUPPORT_MODULE_H +#error "Please include Eigen/SPQRSupport instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/libs/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 013c7ae..36e8ead 100644 --- a/libs/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/libs/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SUITESPARSEQRSUPPORT_H #define EIGEN_SUITESPARSEQRSUPPORT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class SPQR; @@ -50,21 +52,21 @@ namespace Eigen { * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix. * NOTE : The Index type of R is always SuiteSparse_long. 
You can get it with SPQR::Index * - * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<> + * \tparam MatrixType_ The type of the sparse matrix A, must be a column-major SparseMatrix<> * * \implsparsesolverconcept * * */ -template -class SPQR : public SparseSolverBase > +template +class SPQR : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase > Base; using Base::m_isInitialized; public: - typedef typename _MatrixType::Scalar Scalar; - typedef typename _MatrixType::RealScalar RealScalar; + typedef typename MatrixType_::Scalar Scalar; + typedef typename MatrixType_::RealScalar RealScalar; typedef SuiteSparse_long StorageIndex ; typedef SparseMatrix MatrixType; typedef Map > PermutationType; @@ -90,7 +92,7 @@ class SPQR : public SparseSolverBase > cholmod_l_start(&m_cc); } - explicit SPQR(const _MatrixType& matrix) + explicit SPQR(const MatrixType_& matrix) : m_analysisIsOk(false), m_factorizationIsOk(false), m_isRUpToDate(false), @@ -122,7 +124,7 @@ class SPQR : public SparseSolverBase > std::free(m_HPinv); } - void compute(const _MatrixType& matrix) + void compute(const MatrixType_& matrix) { if(m_isInitialized) SPQR_free(); @@ -137,7 +139,7 @@ class SPQR : public SparseSolverBase > { RealScalar max2Norm = 0.0; for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); - if(max2Norm==RealScalar(0)) + if(numext::is_exactly_zero(max2Norm)) max2Norm = RealScalar(1); pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits::epsilon(); } @@ -258,12 +260,12 @@ class SPQR : public SparseSolverBase > int m_ordering; // Ordering method to use, see SPQR's manual int m_allow_tol; // Allow to use some tolerance during numerical factorization. RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero - mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format + mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format mutable MatrixType m_R; // The sparse matrix R in Eigen format - mutable StorageIndex *m_E; // The permutation applied to columns - mutable cholmod_sparse *m_H; //The householder vectors - mutable StorageIndex *m_HPinv; // The row permutation of H - mutable cholmod_dense *m_HTau; // The Householder coefficients + mutable StorageIndex *m_E = nullptr; // The permutation applied to columns + mutable cholmod_sparse *m_H = nullptr; //The householder vectors + mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H + mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters bool m_useDefaultThreshold; // Use default threshold diff --git a/libs/eigen/Eigen/src/SVD/BDCSVD.h b/libs/eigen/Eigen/src/SVD/BDCSVD.h index 17f8e44..a69fdca 100644 --- a/libs/eigen/Eigen/src/SVD/BDCSVD.h +++ b/libs/eigen/Eigen/src/SVD/BDCSVD.h @@ -1,9 +1,9 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
-// +// // We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD" // research report written by Ming Gu and Stanley C.Eisenstat -// The code variable names correspond to the names they used in their +// The code variable names correspond to the names they used in their // report // // Copyright (C) 2013 Gauthier Brun @@ -27,26 +27,50 @@ #define eigen_internal_assert(X) assert(X); #endif +#include "./InternalHeaderCheck.h" + +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#include +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE IOFormat bdcsvdfmt(8, 0, ", ", "\n", " [", "]"); #endif - -template class BDCSVD; + +template +class BDCSVD; namespace internal { -template -struct traits > - : traits<_MatrixType> -{ - typedef _MatrixType MatrixType; -}; +template +struct traits > : svd_traits { + typedef MatrixType_ MatrixType; +}; + +template +struct allocate_small_svd { + static void run(JacobiSVD& smallSvd, Index rows, Index cols, unsigned int computationOptions) { + (void)computationOptions; + smallSvd = JacobiSVD(rows, cols); + } +}; + +EIGEN_DIAGNOSTICS(push) +EIGEN_DISABLE_DEPRECATED_WARNING + +template +struct allocate_small_svd { + static void run(JacobiSVD& smallSvd, Index rows, Index cols, unsigned int computationOptions) { + smallSvd = JacobiSVD(rows, cols, computationOptions); + } +}; + +EIGEN_DIAGNOSTICS(pop) } // end namespace internal - - + /** \ingroup SVD_Module * * @@ -54,7 +78,14 @@ struct traits > * * \brief class Bidiagonal Divide and Conquer SVD * - * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition + * \tparam MatrixType_ the type of the matrix of which we are computing the SVD decomposition + * + * \tparam Options_ this optional parameter allows one to specify options for computing unitaries \a U and \a V. + * Possible values are #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV, and + * #DisableQRDecomposition. It is not possible to request both the thin and full version of \a U or + * \a V. By default, unitaries are not computed. BDCSVD uses R-Bidiagonalization to improve + * performance on tall and wide matrices. For backwards compatibility, the option + * #DisableQRDecomposition can be used to disable this optimization. * * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization, * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD. 
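To make the new compile-time interface concrete, a usage sketch (illustrative; assumes an Eigen version that already contains this patch):

    #include <Eigen/SVD>

    void thin_svd() {
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(1000, 300);   // tall: R-bidiagonalization applies
      // Unitaries are now requested through the Options template parameter.
      Eigen::BDCSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(m);
      Eigen::VectorXd b = Eigen::VectorXd::Random(1000);
      Eigen::VectorXd x = svd.solve(b);                         // least-squares solve
      // Opting out of the QR preconditioning step, as the doc above describes:
      Eigen::BDCSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV |
                                     Eigen::DisableQRDecomposition> svd2(m);
    }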
@@ -69,35 +100,38 @@ struct traits > * * \sa class JacobiSVD */ -template -class BDCSVD : public SVDBase > -{ +template +class BDCSVD : public SVDBase > { typedef SVDBase Base; - + public: using Base::rows; using Base::cols; using Base::computeU; using Base::computeV; - - typedef _MatrixType MatrixType; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; + + typedef MatrixType_ MatrixType; + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; + typedef typename Base::Index Index; enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), - MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, - MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), - MatrixOptions = MatrixType::Options + Options = Options_, + QRDecomposition = Options & internal::QRPreconditionerBits, + ComputationOptions = Options & internal::ComputationOptionsBits, + RowsAtCompileTime = Base::RowsAtCompileTime, + ColsAtCompileTime = Base::ColsAtCompileTime, + DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime, + MaxRowsAtCompileTime = Base::MaxRowsAtCompileTime, + MaxColsAtCompileTime = Base::MaxColsAtCompileTime, + MaxDiagSizeAtCompileTime = Base::MaxDiagSizeAtCompileTime, + MatrixOptions = Base::MatrixOptions }; typedef typename Base::MatrixUType MatrixUType; typedef typename Base::MatrixVType MatrixVType; typedef typename Base::SingularValuesType SingularValuesType; - + typedef Matrix MatrixX; typedef Matrix MatrixXr; typedef Matrix VectorType; @@ -114,70 +148,95 @@ public: BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {} + /** \brief Default Constructor with memory preallocation + * + * Like the default constructor but with preallocation of the internal data + * according to the specified problem size and \a Options template parameter. + * \sa BDCSVD() + */ + BDCSVD(Index rows, Index cols) : m_algoswap(16), m_numIters(0) { + allocate(rows, cols, internal::get_computation_options(Options)); + } /** \brief Default Constructor with memory preallocation * * Like the default constructor but with preallocation of the internal data - * according to the specified problem size. + * according to the specified problem size and the \a computationOptions. + * + * One \b cannot request unitaries using both the \a Options template parameter + * and the constructor. If possible, prefer using the \a Options template parameter. + * + * \param computationOptions specification for computing Thin/Full unitaries U/V * \sa BDCSVD() + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. */ - BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0) - : m_algoswap(16), m_numIters(0) - { + EIGEN_DEPRECATED + BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) { + internal::check_svd_options_assertions(computationOptions, rows, cols); allocate(rows, cols, computationOptions); } - /** \brief Constructor performing the decomposition of given matrix. 
+ /** \brief Constructor performing the decomposition of given matrix, using the custom options specified + * with the \a Options template parameter. * * \param matrix the matrix to decompose - * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed. - * By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, - * #ComputeFullV, #ComputeThinV. - * - * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not - * available with the (non - default) FullPivHouseholderQR preconditioner. */ - BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0) - : m_algoswap(16), m_numIters(0) - { - compute(matrix, computationOptions); + BDCSVD(const MatrixType& matrix) : m_algoswap(16), m_numIters(0) { + compute_impl(matrix, internal::get_computation_options(Options)); } - ~BDCSVD() - { + /** \brief Constructor performing the decomposition of given matrix using specified options + * for computing unitaries. + * + * One \b cannot request unitaries using both the \a Options template parameter + * and the constructor. If possible, prefer using the \a Options template parameter. + * + * \param matrix the matrix to decompose + * \param computationOptions specification for computing Thin/Full unitaries U/V + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. + */ + EIGEN_DEPRECATED + BDCSVD(const MatrixType& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) { + internal::check_svd_options_assertions(computationOptions, matrix.rows(), matrix.cols()); + compute_impl(matrix, computationOptions); } + + ~BDCSVD() {} + + /** \brief Method performing the decomposition of given matrix. Computes Thin/Full unitaries U/V if specified + * using the \a Options template parameter or the class constructor. + * + * \param matrix the matrix to decompose + */ + BDCSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); } - /** \brief Method performing the decomposition of given matrix using custom options. + /** \brief Method performing the decomposition of given matrix, as specified by + * the `computationOptions` parameter. * * \param matrix the matrix to decompose - * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed. - * By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, - * #ComputeFullV, #ComputeThinV. - * - * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not - * available with the (non - default) FullPivHouseholderQR preconditioner. + * \param computationOptions specify whether to compute Thin/Full unitaries U/V + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. */ - BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions); - - /** \brief Method performing the decomposition of given matrix using current options. - * - * \param matrix the matrix to decompose - * - * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int). 
- */ - BDCSVD& compute(const MatrixType& matrix) - { - return compute(matrix, this->m_computationOptions); + EIGEN_DEPRECATED + BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions) { + internal::check_svd_options_assertions(computationOptions, matrix.rows(), matrix.cols()); + return compute_impl(matrix, computationOptions); } - void setSwitchSize(int s) + void setSwitchSize(int s) { - eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3"); + eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3."); m_algoswap = s; } - + private: - void allocate(Index rows, Index cols, unsigned int computationOptions); + BDCSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions); void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift); void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V); void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus); @@ -190,84 +249,107 @@ private: void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev); void structured_update(Block A, const MatrixXr &B, Index n1); static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift); + template + void computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW, Index shift); -protected: + protected: + void allocate(Index rows, Index cols, unsigned int computationOptions); MatrixXr m_naiveU, m_naiveV; MatrixXr m_computed; Index m_nRec; ArrayXr m_workspace; ArrayXi m_workspaceI; int m_algoswap; - bool m_isTranspose, m_compU, m_compV; - - using Base::m_singularValues; - using Base::m_diagSize; - using Base::m_computeFullU; - using Base::m_computeFullV; + bool m_isTranspose, m_compU, m_compV, m_useQrDecomp; + JacobiSVD smallSvd; + HouseholderQR qrDecomp; + internal::UpperBidiagonalization bid; + MatrixX copyWorkspace; + MatrixX reducedTriangle; + + using Base::m_computationOptions; using Base::m_computeThinU; using Base::m_computeThinV; - using Base::m_matrixU; - using Base::m_matrixV; + using Base::m_diagSize; using Base::m_info; using Base::m_isInitialized; + using Base::m_matrixU; + using Base::m_matrixV; using Base::m_nonzeroSingularValues; + using Base::m_singularValues; -public: + public: int m_numIters; -}; //end class BDCSVD - +}; // end class BDCSVD // Method to allocate and initialize matrix and attributes -template -void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) -{ - m_isTranspose = (cols > rows); - +template +void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) { if (Base::allocate(rows, cols, computationOptions)) return; - + + if (cols < m_algoswap) + internal::allocate_small_svd::run(smallSvd, rows, cols, computationOptions); + m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize ); m_compU = computeV(); m_compV = computeU(); + m_isTranspose = (cols > rows); if (m_isTranspose) std::swap(m_compU, m_compV); - + + // kMinAspectRatio is the crossover point that determines if we perform R-Bidiagonalization + // or bidiagonalize the input matrix directly. + // It is based off of LAPACK's dgesdd routine, which uses 11.0/6.0 + // we use a larger scalar to prevent a regression for relatively square matrices. 
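Two tunables meet at this point: setSwitchSize() above controls when a problem (or a recursion leaf) is handed to the embedded JacobiSVD, while the aspect-ratio constant defined just below controls the QR-first path. A short sketch of the first knob, with hypothetical sizes; per the assertion above, the switch size must be at least 3.

    // Inputs with fewer than m_algoswap columns never enter the
    // divide-and-conquer recursion; they go straight to the small JacobiSVD.
    Eigen::BDCSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(200, 100);
    svd.setSwitchSize(32);  // default is 16
    svd.compute(Eigen::MatrixXd::Random(200, 100));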
+ constexpr Index kMinAspectRatio = 4; + constexpr bool disableQrDecomp = static_cast(QRDecomposition) == static_cast(DisableQRDecomposition); + m_useQrDecomp = !disableQrDecomp && ((rows / kMinAspectRatio > cols) || (cols / kMinAspectRatio > rows)); + if (m_useQrDecomp) { + qrDecomp = HouseholderQR((std::max)(rows, cols), (std::min)(rows, cols)); + reducedTriangle = MatrixX(m_diagSize, m_diagSize); + } + + copyWorkspace = MatrixX(m_isTranspose ? cols : rows, m_isTranspose ? rows : cols); + bid = internal::UpperBidiagonalization(m_useQrDecomp ? m_diagSize : copyWorkspace.rows(), + m_useQrDecomp ? m_diagSize : copyWorkspace.cols()); + if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 ); else m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 ); - + if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize); - + m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3); m_workspaceI.resize(3*m_diagSize); -}// end allocate +} // end allocate -template -BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsigned int computationOptions) -{ +template +BDCSVD& BDCSVD::compute_impl(const MatrixType& matrix, + unsigned int computationOptions) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "\n\n\n======================================================================================================================\n\n\n"; #endif - allocate(matrix.rows(), matrix.cols(), computationOptions); using std::abs; + allocate(matrix.rows(), matrix.cols(), computationOptions); + const RealScalar considerZero = (std::numeric_limits::min)(); - + //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return if(matrix.cols() < m_algoswap) { - // FIXME this line involves temporaries - JacobiSVD jsvd(matrix,computationOptions); + smallSvd.compute(matrix); m_isInitialized = true; - m_info = jsvd.info(); + m_info = smallSvd.info(); if (m_info == Success || m_info == NoConvergence) { - if(computeU()) m_matrixU = jsvd.matrixU(); - if(computeV()) m_matrixV = jsvd.matrixV(); - m_singularValues = jsvd.singularValues(); - m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + if (computeU()) m_matrixU = smallSvd.matrixU(); + if (computeV()) m_matrixV = smallSvd.matrixV(); + m_singularValues = smallSvd.singularValues(); + m_nonzeroSingularValues = smallSvd.nonzeroSingularValues(); } return *this; } - + //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows RealScalar scale = matrix.cwiseAbs().template maxCoeff(); if (!(numext::isfinite)(scale)) { @@ -276,14 +358,23 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign return *this; } - if(scale==Literal(0)) scale = Literal(1); - MatrixX copy; - if (m_isTranspose) copy = matrix.adjoint()/scale; - else copy = matrix/scale; - - //**** step 1 - Bidiagonalization - // FIXME this line involves temporaries - internal::UpperBidiagonalization bid(copy); + if(numext::is_exactly_zero(scale)) scale = Literal(1); + + if (m_isTranspose) copyWorkspace = matrix.adjoint() / scale; + else copyWorkspace = matrix / scale; + + //**** step 1 - Bidiagonalization. + // If the problem is sufficiently rectangular, we perform R-Bidiagonalization: compute A = Q(R/0) + // and then bidiagonalize R. Otherwise, if the problem is relatively square, we + // bidiagonalize the input matrix directly. 
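The branch that follows implements exactly this comment. As a standalone sketch of the same idea in terms of public decompositions (sizes illustrative): for a tall A, factor A = Q(R/0), take the SVD of the small triangle R = U S V^T, and recover the left singular vectors of A as Q U.

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(1000, 40);
    Eigen::HouseholderQR<Eigen::MatrixXd> qr(A);
    // Dense copy of the 40x40 upper triangle R (entries below it are zeroed).
    Eigen::MatrixXd R = qr.matrixQR().topRows(40).triangularView<Eigen::Upper>();
    Eigen::BDCSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svdR(R);
    // Thin Q is 1000x40; the left singular vectors of A are Q * U_R.
    Eigen::MatrixXd thinQ = qr.householderQ() * Eigen::MatrixXd::Identity(1000, 40);
    Eigen::MatrixXd U = thinQ * svdR.matrixU();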
+ if (m_useQrDecomp) { + qrDecomp.compute(copyWorkspace); + reducedTriangle = qrDecomp.matrixQR().topRows(m_diagSize); + reducedTriangle.template triangularView().setZero(); + bid.compute(reducedTriangle); + } else { + bid.compute(copyWorkspace); + } //**** step 2 - Divide & Conquer m_naiveU.setZero(); @@ -296,7 +387,7 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign m_isInitialized = true; return *this; } - + //**** step 3 - Copy singular values and vectors for (int i=0; i& BDCSVD::compute(const MatrixType& matrix, unsign } } -#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE -// std::cout << "m_naiveU\n" << m_naiveU << "\n\n"; -// std::cout << "m_naiveV\n" << m_naiveV << "\n\n"; -#endif + //**** step 4 - Finalize unitaries U and V if(m_isTranspose) copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU); else copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV); + if (m_useQrDecomp) { + if (m_isTranspose && computeV()) m_matrixV.applyOnTheLeft(qrDecomp.householderQ()); + else if (!m_isTranspose && computeU()) m_matrixU.applyOnTheLeft(qrDecomp.householderQ()); + } + m_isInitialized = true; return *this; -}// end compute +} // end compute - -template -template -void BDCSVD::copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naiveV) -{ +template +template +void BDCSVD::copyUV(const HouseholderU& householderU, const HouseholderV& householderV, + const NaiveU& naiveU, const NaiveV& naiveV) { // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa if (computeU()) { - Index Ucols = m_computeThinU ? m_diagSize : householderU.cols(); - m_matrixU = MatrixX::Identity(householderU.cols(), Ucols); + Index Ucols = m_computeThinU ? m_diagSize : rows(); + m_matrixU = MatrixX::Identity(rows(), Ucols); m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast().topLeftCorner(m_diagSize, m_diagSize); - householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer + // FIXME the following conditionals involve temporary buffers + if (m_useQrDecomp) m_matrixU.topLeftCorner(householderU.cols(), m_diagSize).applyOnTheLeft(householderU); + else m_matrixU.applyOnTheLeft(householderU); } if (computeV()) { - Index Vcols = m_computeThinV ? m_diagSize : householderV.cols(); - m_matrixV = MatrixX::Identity(householderV.cols(), Vcols); + Index Vcols = m_computeThinV ? m_diagSize : cols(); + m_matrixV = MatrixX::Identity(cols(), Vcols); m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast().topLeftCorner(m_diagSize, m_diagSize); - householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer + // FIXME the following conditionals involve temporary buffers + if (m_useQrDecomp) m_matrixV.topLeftCorner(householderV.cols(), m_diagSize).applyOnTheLeft(householderV); + else m_matrixV.applyOnTheLeft(householderV); } } @@ -356,9 +452,8 @@ void BDCSVD::copyUV(const HouseholderU &householderU, const Househol * We can thus pack them prior to the the matrix product. However, this is only worth the effort if the matrix is large * enough. 
*/ -template -void BDCSVD::structured_update(Block A, const MatrixXr &B, Index n1) -{ +template +void BDCSVD::structured_update(Block A, const MatrixXr& B, Index n1) { Index n = A.rows(); if(n>100) { @@ -385,7 +480,7 @@ void BDCSVD::structured_update(Block A, co ++k2; } } - + A.topRows(n1).noalias() = A1.leftCols(k1) * B1.topRows(k1); A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2); } @@ -397,19 +492,37 @@ void BDCSVD::structured_update(Block A, co } } -// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the -// place of the submatrix we are currently working on. +template +template +void BDCSVD::computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, + Index firstColW, Index shift) { + svd.compute(m_computed.block(firstCol, firstCol, n + 1, n)); + m_info = svd.info(); + if (m_info != Success && m_info != NoConvergence) return; + if (m_compU) + m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = svd.matrixU(); + else { + m_naiveU.row(0).segment(firstCol, n + 1).real() = svd.matrixU().row(0); + m_naiveU.row(1).segment(firstCol, n + 1).real() = svd.matrixU().row(n); + } + if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = svd.matrixV(); + m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero(); + m_computed.diagonal().segment(firstCol + shift, n) = svd.singularValues().head(n); +} + +// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods +// takes as argument the place of the submatrix we are currently working on. //@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU; -//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; +//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; // lastCol + 1 - firstCol is the size of the submatrix. //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W) -//@param firstRowW : Same as firstRowW with the column. -//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix +//@param firstColW : Same as firstRowW with the column. +//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. -template -void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) -{ +template +void BDCSVD::divide(Index firstCol, Index lastCol, Index firstRowW, + Index firstColW, Index shift) { // requires rows = cols + 1; using std::pow; using std::sqrt; @@ -418,36 +531,30 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig const Index k = n/2; const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar alphaK; - RealScalar betaK; - RealScalar r0; + RealScalar betaK; + RealScalar r0; RealScalar lambda, phi, c0, s0; VectorType l, f; - // We use the other algorithm which is more efficient for small + // We use the other algorithm which is more efficient for small // matrices. 
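Before the small-matrix branch that follows, a simplified analogue of the packing trick used by structured_update above: columns of A that are numerically zero contribute nothing to A*B, so pack the surviving columns of A together with the matching rows of B and run the product on the smaller dense blocks. The real code additionally splits A into top and bottom halves; here `eps` stands in for its considerZero threshold.

    Eigen::MatrixXd packedProduct(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B, double eps) {
      Eigen::Index k = 0;
      Eigen::MatrixXd A1(A.rows(), A.cols());
      Eigen::MatrixXd B1(B.rows(), B.cols());
      for (Eigen::Index j = 0; j < A.cols(); ++j) {
        if (A.col(j).cwiseAbs().maxCoeff() > eps) {  // keep only non-negligible columns
          A1.col(k) = A.col(j);
          B1.row(k) = B.row(j);
          ++k;
        }
      }
      return A1.leftCols(k) * B1.topRows(k);  // equals A * B up to the dropped columns
    }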
if (n < m_algoswap) { - // FIXME this line involves temporaries - JacobiSVD b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)); - m_info = b.info(); - if (m_info != Success && m_info != NoConvergence) return; - if (m_compU) - m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); - else - { - m_naiveU.row(0).segment(firstCol, n + 1).real() = b.matrixU().row(0); - m_naiveU.row(1).segment(firstCol, n + 1).real() = b.matrixU().row(n); + // FIXME this block involves temporaries + if (m_compV) { + JacobiSVD baseSvd; + computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift); + } else { + JacobiSVD baseSvd; + computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift); } - if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = b.matrixV(); - m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero(); - m_computed.diagonal().segment(firstCol + shift, n) = b.singularValues().head(n); return; } // We use the divide and conquer algorithm alphaK = m_computed(firstCol + k, firstCol + k); betaK = m_computed(firstCol + k + 1, firstCol + k); // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices - // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the - // right submatrix before the left one. + // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the + // right submatrix before the left one. divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift); if (m_info != Success && m_info != NoConvergence) return; divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1); @@ -457,8 +564,8 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig { lambda = m_naiveU(firstCol + k, firstCol + k); phi = m_naiveU(firstCol + k + 1, lastCol + 1); - } - else + } + else { lambda = m_naiveU(1, firstCol + k); phi = m_naiveU(0, lastCol + 1); @@ -468,8 +575,8 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig { l = m_naiveU.row(firstCol + k).segment(firstCol, k); f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1); - } - else + } + else { l = m_naiveU.row(1).segment(firstCol, k); f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); @@ -485,52 +592,52 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig c0 = alphaK * lambda / r0; s0 = betaK * phi / r0; } - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); #endif - + if (m_compU) { - MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1)); + MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1)); // we shiftW Q1 to the right - for (Index i = firstCol + k - 1; i >= firstCol; i--) + for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1); // we shift q1 at the left with a factor c0 m_naiveU.col(firstCol).segment( firstCol, k + 1) = (q1 * c0); // last column = q1 * - s0 m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * ( - s0)); // first column = q2 * s0 - m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol 
+ k + 1, n - k) * s0; + m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0; // q2 *= c0 m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0; - } - else + } + else { RealScalar q1 = m_naiveU(0, firstCol + k); // we shift Q1 to the right - for (Index i = firstCol + k - 1; i >= firstCol; i--) + for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i); // we shift q1 at the left with a factor c0 m_naiveU(0, firstCol) = (q1 * c0); // last column = q1 * - s0 m_naiveU(0, lastCol + 1) = (q1 * ( - s0)); // first column = q2 * s0 - m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; + m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; // q2 *= c0 m_naiveU(1, lastCol + 1) *= c0; m_naiveU.row(1).segment(firstCol + 1, k).setZero(); m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero(); } - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); #endif - + m_computed(firstCol + shift, firstCol + shift) = r0; m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real(); m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real(); @@ -547,21 +654,21 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig std::cout << "err: " << ((tmp1-tmp2).abs()>1e-12*tmp2.abs()).transpose() << "\n"; static int count = 0; std::cout << "# " << ++count << "\n\n"; - assert((tmp1-tmp2).matrix().norm() < 1e-14*tmp2.matrix().norm()); -// assert(count<681); -// assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all()); + eigen_internal_assert((tmp1-tmp2).matrix().norm() < 1e-14*tmp2.matrix().norm()); +// eigen_internal_assert(count<681); +// eigen_internal_assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all()); #endif - + // Third part: compute SVD of combined matrix MatrixXr UofSVD, VofSVD; VectorType singVals; computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD); - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(UofSVD.allFinite()); - assert(VofSVD.allFinite()); + eigen_internal_assert(UofSVD.allFinite()); + eigen_internal_assert(VofSVD.allFinite()); #endif - + if (m_compU) structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2); else @@ -570,18 +677,18 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eig tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD; m_naiveU.middleCols(firstCol, n + 1) = tmp; } - + if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2); - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); #endif - + m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero(); m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals; -}// end divide +} // end divide // Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in // the first column and on the diagonal and has undergone deflation, so diagonal is in increasing @@ -591,9 +698,9 @@ void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index 
lastCol, Eig // TODO Opportunities for optimization: better root finding algo, better stopping criterion, better // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf -template -void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) -{ +template +void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, + VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); @@ -610,18 +717,21 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma if (col0.hasNaN() || diag.hasNaN()) std::cout << "\n\nHAS NAN\n\n"; #endif - + // Many singular values might have been deflated, the zero ones have been moved to the end, // but others are interleaved and we must ignore them at this stage. // To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } + while(actual_n>1 && numext::is_exactly_zero(diag(actual_n - 1))) { + --actual_n; + eigen_internal_assert(numext::is_exactly_zero(col0(actual_n))); + } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) m_workspaceI(m++) = k; Map perm(m_workspaceI.data(),m); - + Map shifts(m_workspace.data()+1*n, n); Map mus(m_workspace.data()+2*n, n); Map zhat(m_workspace.data()+3*n, n); @@ -631,58 +741,58 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma std::cout << " z: " << col0.transpose() << "\n"; std::cout << " d: " << diag.transpose() << "\n"; #endif - + // Compute singVals, shifts, and mus computeSingVals(col0, diag, perm, singVals, shifts, mus); - + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << " j: " << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse() << "\n\n"; std::cout << " sing-val: " << singVals.transpose() << "\n"; std::cout << " mu: " << mus.transpose() << "\n"; std::cout << " shift: " << shifts.transpose() << "\n"; - + { std::cout << "\n\n mus: " << mus.head(actual_n).transpose() << "\n\n"; std::cout << " check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n"; - assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all()); + eigen_internal_assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); + eigen_internal_assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(singVals.allFinite()); - assert(mus.allFinite()); - assert(shifts.allFinite()); + eigen_internal_assert(singVals.allFinite()); + eigen_internal_assert(mus.allFinite()); + eigen_internal_assert(shifts.allFinite()); #endif - + // Compute zhat perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << " zhat: " << zhat.transpose() << "\n"; #endif - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(zhat.allFinite()); + eigen_internal_assert(zhat.allFinite()); #endif - + computeSingVecs(zhat, 
diag, perm, singVals, shifts, mus, U, V); - + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() << "\n"; std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() << "\n"; #endif - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); - assert(U.allFinite()); - assert(V.allFinite()); -// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); -// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); + eigen_internal_assert(U.allFinite()); + eigen_internal_assert(V.allFinite()); +// eigen_internal_assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// eigen_internal_assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif - + // Because of deflation, the singular values might not be completely sorted. // Fortunately, reordering them is a O(n) problem for(Index i=0; i::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); if(!singular_values_sorted) std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; - assert(singular_values_sorted); + eigen_internal_assert(singular_values_sorted); } #endif - + // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end singVals.head(actual_n).reverseInPlace(); U.leftCols(actual_n).rowwise().reverseInPlace(); if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace(); - + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE JacobiSVD jsvd(m_computed.block(firstCol, firstCol, n, n) ); std::cout << " * j: " << jsvd.singularValues().transpose() << "\n\n"; @@ -719,9 +829,10 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma #endif } -template -typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) -{ +template +typename BDCSVD::RealScalar BDCSVD::secularEq( + RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const ArrayRef& diagShifted, + RealScalar shift) { Index m = perm.size(); RealScalar res = Literal(1); for(Index i=0; i::RealScalar BDCSVD::secularEq(RealScalar res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu)); } return res; - } -template -void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, - VectorType& singVals, ArrayRef shifts, ArrayRef mus) -{ +template +void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, + VectorType& singVals, ArrayRef shifts, ArrayRef mus) { using std::abs; using std::swap; using std::sqrt; @@ -747,11 +856,11 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d Index actual_n = n; // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above // because 1) we have diag(i)==0 => 
col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value. - while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; + while(actual_n>1 && numext::is_exactly_zero(col0(actual_n - 1))) --actual_n; for (Index k = 0; k < n; ++k) { - if (col0(k) == Literal(0) || actual_n==1) + if (numext::is_exactly_zero(col0(k)) || actual_n == 1) { // if col0(k) == 0, then entry is deflated, so singular value is on diagonal // if actual_n==1, then the deflated problem is already diagonalized @@ -759,7 +868,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d mus(k) = Literal(0); shifts(k) = k==0 ? col0(0) : diag(k); continue; - } + } // otherwise, use secular equation to find singular value RealScalar left = diag(k); @@ -772,7 +881,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside. // This should be equivalent to using perm[] Index l = k+1; - while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l::computeSingVals(const ArrayRef& col0, const ArrayRef& d << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n"; #endif RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; - + // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); diagShifted = diag - shift; @@ -807,7 +916,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d { // check that after the shift, f(mid) is still negative: RealScalar midShifted = (right - left) / RealScalar(2); - if(shift==right) + // we can test exact equality here, because shift comes from `... ? left : right` + if(numext::equal_strict(shift, right)) midShifted = -midShifted; RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift); if(fMidShifted>0) @@ -817,10 +927,11 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d diagShifted = diag - shift; } } - + // initial guess RealScalar muPrev, muCur; - if (shift == left) + // we can test exact equality here, because shift comes from `... ? 
left : right` + if (numext::equal_strict(shift, left)) { muPrev = (right - left) * RealScalar(0.1); if (k == actual_n-1) muCur = right - left; @@ -843,7 +954,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate bool useBisection = fPrev*fCur>Literal(0); - while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) + while (!numext::is_exactly_zero(fCur) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev) > NumTraits::epsilon() && !useBisection) { ++m_numIters; @@ -855,16 +966,17 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert((numext::isfinite)(fZero)); + eigen_internal_assert((numext::isfinite)(fZero)); #endif - + muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - - if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; - if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; + + // we can test exact equality here, because shift comes from `... ? left : right` + if (numext::equal_strict(shift, left) && (muCur < Literal(0) || muCur > right - left)) useBisection = true; + if (numext::equal_strict(shift, right) && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; } @@ -875,7 +987,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d std::cout << "useBisection for k = " << k << ", actual_n = " << actual_n << "\n"; #endif RealScalar leftShifted, rightShifted; - if (shift == left) + // we can test exact equality here, because shift comes from `... ? left : right` + if (numext::equal_strict(shift, left)) { // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), // the factor 2 is to be more conservative @@ -899,20 +1012,20 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d } muCur = (leftShifted + rightShifted) / Literal(2); } - else + else { // We have a problem as shifting on the left or right give either a positive or negative value // at the middle of [left,right]... // Instead fo abbording or entering an infinite loop, // let's just use the middle as the estimated zero-crossing: muCur = (right - left) * RealScalar(0.5); - if(shift == right) + // we can test exact equality here, because shift comes from `... ? left : right` + if(numext::equal_strict(shift, right)) muCur = -muCur; } } - + singVals[k] = shift + muCur; shifts[k] = shift; mus[k] = muCur; @@ -967,25 +1081,23 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. 
" << diag(k+1) << "\n"; #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(k==0 || singVals[k]>=singVals[k-1]); - assert(singVals[k]>=diag(k)); + eigen_internal_assert(k==0 || singVals[k]>=singVals[k-1]); + eigen_internal_assert(singVals[k]>=diag(k)); #endif // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - -// if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); -// if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); + // if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); + // if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); } } - // zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1) -template -void BDCSVD::perturbCol0 - (const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, - const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat) -{ +template +void BDCSVD::perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, + const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, + ArrayRef zhat) { using std::sqrt; Index n = col0.size(); Index m = perm.size(); @@ -998,7 +1110,7 @@ void BDCSVD::perturbCol0 // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { - if (col0(k) == Literal(0)) // deflated + if (numext::is_exactly_zero(col0(k))) // deflated zhat(k) = Literal(0); else { @@ -1011,7 +1123,7 @@ void BDCSVD::perturbCol0 std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; } - assert(prod>=0); + eigen_internal_assert(prod>=0); #endif for(Index l = 0; l::perturbCol0 std::cout << " " << "j=" << j << "\n"; } #endif - Index j = i= k && l == 0) { + m_info = NumericalIssue; + prod = 0; + break; + } + Index j = i 0 ? perm(l-1) : i; #ifdef EIGEN_BDCSVD_SANITY_CHECKS if(!(dk!=Literal(0) || diag(i)!=Literal(0))) { std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n"; } - assert(dk!=Literal(0) || diag(i)!=Literal(0)); + eigen_internal_assert(dk!=Literal(0) || diag(i)!=Literal(0)); #endif prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk))); #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(prod>=0); + eigen_internal_assert(prod>=0); #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) @@ -1053,7 +1172,7 @@ void BDCSVD::perturbCol0 #endif RealScalar tmp = sqrt(prod); #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert((numext::isfinite)(tmp)); + eigen_internal_assert((numext::isfinite)(tmp)); #endif zhat(k) = col0(k) > Literal(0) ? 
RealScalar(tmp) : RealScalar(-tmp); } @@ -1061,17 +1180,16 @@ void BDCSVD::perturbCol0 } // compute singular vectors -template -void BDCSVD::computeSingVecs - (const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, - const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V) -{ +template +void BDCSVD::computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, + const VectorType& singVals, const ArrayRef& shifts, + const ArrayRef& mus, MatrixXr& U, MatrixXr& V) { Index n = zhat.size(); Index m = perm.size(); - + for (Index k = 0; k < n; ++k) { - if (zhat(k) == Literal(0)) + if (numext::is_exactly_zero(zhat(k))) { U.col(k) = VectorType::Unit(n+1, k); if (m_compV) V.col(k) = VectorType::Unit(n, k); @@ -1086,7 +1204,7 @@ void BDCSVD::computeSingVecs } U(n,k) = Literal(0); U.col(k).normalize(); - + if (m_compV) { V.col(k).setZero(); @@ -1103,13 +1221,12 @@ void BDCSVD::computeSingVecs U.col(n) = VectorType::Unit(n+1, n); } - // page 12_13 // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M -template -void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) -{ +template +void BDCSVD::deflation43(Index firstCol, Index shift, Index i, + Index size) { using std::abs; using std::sqrt; using std::pow; @@ -1117,28 +1234,28 @@ void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); RealScalar r = numext::hypot(c,s); - if (r == Literal(0)) + if (numext::is_exactly_zero(r)) { m_computed(start+i, start+i) = Literal(0); return; } - m_computed(start,start) = r; + m_computed(start,start) = r; m_computed(start+i, start) = Literal(0); m_computed(start+i, start+i) = Literal(0); - + JacobiRotation J(c/r,-s/r); if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J); else m_naiveU.applyOnTheRight(firstCol, firstCol+i, J); -}// end deflation 43 - +} // end deflation 43 // page 13 // i,j >= 1, i!=j and |di - dj| < epsilon * norm2(M) // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested -template -void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) -{ +template +void BDCSVD::deflation44(Index firstColu, Index firstColm, Index firstRowW, + Index firstColW, Index i, Index j, + Index size) { using std::abs; using std::sqrt; using std::conj; @@ -1157,7 +1274,7 @@ void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index first << m_computed(firstColm + i+1, firstColm+i+1) << " " << m_computed(firstColm + i+2, firstColm+i+2) << "\n"; #endif - if (r==Literal(0)) + if (numext::is_exactly_zero(r)) { m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j); return; @@ -1172,39 +1289,38 @@ void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index first if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J); else m_naiveU.applyOnTheRight(firstColu+i, firstColu+j, J); if (m_compV) m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + i, firstColW + j, J); -}// end deflation 44 - +} // end deflation 44 // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] -template -void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index 
lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) -{ +template +void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, + Index firstRowW, Index firstColW, Index shift) { using std::sqrt; using std::abs; const Index length = lastCol + 1 - firstCol; - + Block col0(m_computed, firstCol+shift, firstCol+shift, length, 1); Diagonal fulldiag(m_computed); VectorBlock,Dynamic> diag(fulldiag, firstCol+shift, length); - + const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); RealScalar epsilon_coarse = Literal(8) * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); #endif -#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "\ndeflate:" << diag.head(k+1).transpose() << " | " << diag.segment(k+1,length-k-1).transpose() << "\n"; #endif - + //condition 4.1 if (diag(0) < epsilon_coarse) - { + { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n"; #endif @@ -1232,31 +1348,31 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, } #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(m_naiveU.allFinite()); - assert(m_naiveV.allFinite()); - assert(m_computed.allFinite()); + eigen_internal_assert(m_naiveU.allFinite()); + eigen_internal_assert(m_naiveV.allFinite()); + eigen_internal_assert(m_computed.allFinite()); #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; std::cout << " : " << col0.transpose() << "\n\n"; #endif { - // Check for total deflation - // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array()::deflation(Eigen::Index firstCol, Eigen::Index lastCol, else permutation[p] = i++; } } - + // If we have a total deflation, then we have to insert diag(0) at the right place if(total_deflation) { @@ -1282,22 +1398,22 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, } } } - + // Current index of each col, and current column of each index Index *realInd = m_workspaceI.data()+length; Index *realCol = m_workspaceI.data()+2*length; - + for(int pos = 0; pos< length; pos++) { realCol[pos] = pos; realInd[pos] = pos; } - + for(Index i = total_deflation?0:1; i < length; i++) { const Index pi = permutation[length - (total_deflation ? 
i+1 : i)]; const Index J = realCol[pi]; - + using std::swap; // swap diagonal and first column entries: swap(diag(i), diag(J)); @@ -1320,7 +1436,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n"; std::cout << " : " << col0.transpose() << "\n\n"; #endif - + //condition 4.4 { Index i = length-1; @@ -1335,18 +1451,18 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length); } } - + #ifdef EIGEN_BDCSVD_SANITY_CHECKS for(Index j=2;j::deflation(Eigen::Index firstCol, Eigen::Index lastCol, * * \sa class BDCSVD */ -template -BDCSVD::PlainObject> -MatrixBase::bdcSvd(unsigned int computationOptions) const -{ - return BDCSVD(*this, computationOptions); +template +template +BDCSVD::PlainObject, Options> MatrixBase::bdcSvd() const { + return BDCSVD(*this); +} + +/** \svd_module + * + * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm + * + * \sa class BDCSVD + */ +template +template +BDCSVD::PlainObject, Options> MatrixBase::bdcSvd( + unsigned int computationOptions) const { + return BDCSVD(*this, computationOptions); } } // end namespace Eigen diff --git a/libs/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h b/libs/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h new file mode 100644 index 0000000..d4cc173 --- /dev/null +++ b/libs/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h @@ -0,0 +1,163 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2022 Melven Roehrig-Zoellner +// Copyright (c) 2011, Intel Corporation. All rights reserved. +// +// This file is based on the JacobiSVD_LAPACKE.h originally from Intel - +// see license notice below: +/* + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + ******************************************************************************** + * Content : Eigen bindings to LAPACKe + * Singular Value Decomposition - SVD (divide and conquer variant) + ******************************************************************************** +*/ +#ifndef EIGEN_BDCSVD_LAPACKE_H +#define EIGEN_BDCSVD_LAPACKE_H + +namespace Eigen { + +namespace internal { + +namespace lapacke_helpers { + +/** \internal Specialization for the data types supported by LAPACKe */ + +// defining a derived class to allow access to protected members +template +class BDCSVD_LAPACKE : public BDCSVD { + typedef BDCSVD SVD; + typedef typename SVD::MatrixType MatrixType; + typedef typename SVD::Scalar Scalar; + typedef typename SVD::RealScalar RealScalar; + +public: + // construct this by moving from a parent object + BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {} + + void compute_impl_lapacke(const MatrixType& matrix, unsigned int computationOptions) { + + SVD::allocate(matrix.rows(), matrix.cols(), computationOptions); + + SVD::m_nonzeroSingularValues = SVD::m_diagSize; + + // prepare arguments to ?gesdd + const lapack_int matrix_order = lapack_storage_of(matrix); + const char jobz = (SVD::m_computeFullU || SVD::m_computeFullV) ? 'A' : (SVD::m_computeThinU || SVD::m_computeThinV) ? 'S' : 'N'; + const lapack_int u_cols = (jobz == 'A') ? to_lapack(SVD::m_rows) : (jobz == 'S') ? to_lapack(SVD::m_diagSize) : 1; + const lapack_int vt_rows = (jobz == 'A') ? to_lapack(SVD::m_cols) : (jobz == 'S') ? to_lapack(SVD::m_diagSize) : 1; + lapack_int ldu, ldvt; + Scalar *u, *vt, dummy; + MatrixType localU; + if (SVD::computeU() && !(SVD::m_computeThinU && SVD::m_computeFullV) ) { + ldu = to_lapack(SVD::m_matrixU.outerStride()); + u = SVD::m_matrixU.data(); + } else if (SVD::computeV()) { + localU.resize(SVD::m_rows, u_cols); + ldu = to_lapack(localU.outerStride()); + u = localU.data(); + } else { ldu=1; u=&dummy; } + MatrixType localV; + if (SVD::computeU() || SVD::computeV()) { + localV.resize(vt_rows, SVD::m_cols); + ldvt = to_lapack(localV.outerStride()); + vt = localV.data(); + } else { ldvt=1; vt=&dummy; } + MatrixType temp; temp = matrix; + + // actual call to ?gesdd + lapack_int info = gesdd( matrix_order, jobz, to_lapack(SVD::m_rows), to_lapack(SVD::m_cols), + to_lapack(temp.data()), to_lapack(temp.outerStride()), (RealScalar*)SVD::m_singularValues.data(), + to_lapack(u), ldu, to_lapack(vt), ldvt); + + // Check the result of the LAPACK call + if (info < 0 || !SVD::m_singularValues.allFinite()) { + // this includes info == -4 => NaN entry in A + SVD::m_info = InvalidInput; + } else if (info > 0 ) { + SVD::m_info = NoConvergence; + } else { + SVD::m_info = Success; + if (SVD::m_computeThinU && SVD::m_computeFullV) { + SVD::m_matrixU = localU.leftCols(SVD::m_matrixU.cols()); + } + if (SVD::computeV()) { + SVD::m_matrixV = localV.adjoint().leftCols(SVD::m_matrixV.cols()); + } + } + SVD::m_isInitialized = true; + } +}; + +template +BDCSVD& BDCSVD_wrapper(BDCSVD& svd, const MatrixType_& matrix, int computationOptions) +{ + // we need to move to the wrapper type and back + BDCSVD_LAPACKE tmpSvd(std::move(svd)); + tmpSvd.compute_impl_lapacke(matrix, computationOptions); + svd = std::move(tmpSvd); + return svd; +} + +} // end namespace lapacke_helpers + +} // end namespace internal + +#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS) \ +template<> inline \ +BDCSVD, OPTIONS>& \ +BDCSVD, OPTIONS>::compute_impl(const Matrix& matrix, unsigned int computationOptions) {\ + return 
internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions); \ +} + +#define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS) \ + EIGEN_LAPACKE_SDD(double, ColMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(float, ColMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(dcomplex, ColMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(scomplex, ColMajor, OPTIONS) \ +\ + EIGEN_LAPACKE_SDD(double, RowMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(float, RowMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(dcomplex, RowMajor, OPTIONS) \ + EIGEN_LAPACKE_SDD(scomplex, RowMajor, OPTIONS) + +EIGEN_LAPACK_SDD_OPTIONS(0) +EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU) +EIGEN_LAPACK_SDD_OPTIONS(ComputeThinV) +EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU) +EIGEN_LAPACK_SDD_OPTIONS(ComputeFullV) +EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeThinV) +EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeFullV) +EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeFullV) +EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeThinV) + +#undef EIGEN_LAPACK_SDD_OPTIONS + +#undef EIGEN_LAPACKE_SDD + +} // end namespace Eigen + +#endif // EIGEN_BDCSVD_LAPACKE_H diff --git a/libs/eigen/Eigen/src/SVD/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SVD/InternalHeaderCheck.h new file mode 100644 index 0000000..fa67b96 --- /dev/null +++ b/libs/eigen/Eigen/src/SVD/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SVD_MODULE_H +#error "Please include Eigen/SVD instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SVD/JacobiSVD.h b/libs/eigen/Eigen/src/SVD/JacobiSVD.h index 9d95acd..d7dc209 100644 --- a/libs/eigen/Eigen/src/SVD/JacobiSVD.h +++ b/libs/eigen/Eigen/src/SVD/JacobiSVD.h @@ -11,13 +11,15 @@ #ifndef EIGEN_JACOBISVD_H #define EIGEN_JACOBISVD_H -namespace Eigen { +#include "./InternalHeaderCheck.h" + +namespace Eigen { namespace internal { + // forward declaration (needed by ICC) // the empty body is required by MSVC -template::IsComplex> +template ::IsComplex> struct svd_precondition_2x2_block_to_be_real {}; /*** QR preconditioners (R-SVD) @@ -44,47 +46,40 @@ struct qr_preconditioner_should_do_anything }; }; -template::ret -> struct qr_preconditioner_impl {}; +template ::ret> +struct qr_preconditioner_impl {}; -template -class qr_preconditioner_impl -{ -public: - void allocate(const JacobiSVD&) {} - bool run(JacobiSVD&, const MatrixType&) - { - return false; - } +template +class qr_preconditioner_impl { + public: + void allocate(const JacobiSVD&) {} + bool run(JacobiSVD&, const MatrixType&) { return false; } }; /*** preconditioner using FullPivHouseholderQR ***/ -template -class qr_preconditioner_impl -{ -public: +template +class qr_preconditioner_impl { + public: typedef typename MatrixType::Scalar Scalar; - enum - { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime - }; - typedef Matrix WorkspaceType; + typedef JacobiSVD SVDType; - void allocate(const JacobiSVD& svd) - { + enum { WorkspaceSize = MatrixType::RowsAtCompileTime, MaxWorkspaceSize = MatrixType::MaxRowsAtCompileTime }; + + typedef Matrix WorkspaceType; + + void allocate(const SVDType& svd) { if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.rows(), svd.cols()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.rows(), svd.cols()); } if (svd.m_computeFullU) m_workspace.resize(svd.rows()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.rows() > matrix.cols()) { 
m_qr.compute(matrix); @@ -95,43 +90,43 @@ public: } return false; } + private: typedef FullPivHouseholderQR QRType; QRType m_qr; WorkspaceType m_workspace; }; -template -class qr_preconditioner_impl -{ -public: +template +class qr_preconditioner_impl { + public: typedef typename MatrixType::Scalar Scalar; - enum - { + typedef JacobiSVD SVDType; + + enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + MatrixOptions = MatrixType::Options }; - typedef typename internal::make_proper_matrix_type< - Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime - >::type TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type::type + TransposeTypeWithSameStorageOrder; - void allocate(const JacobiSVD& svd) - { + void allocate(const SVDType& svd) { if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.cols(), svd.rows()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.cols(), svd.rows()); } m_adjoint.resize(svd.cols(), svd.rows()); if (svd.m_computeFullV) m_workspace.resize(svd.cols()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.cols() > matrix.rows()) { m_adjoint = matrix.adjoint(); @@ -143,32 +138,41 @@ public: } else return false; } + private: typedef FullPivHouseholderQR QRType; QRType m_qr; TransposeTypeWithSameStorageOrder m_adjoint; - typename internal::plain_row_type::type m_workspace; + typename plain_row_type::type m_workspace; }; /*** preconditioner using ColPivHouseholderQR ***/ -template -class qr_preconditioner_impl -{ -public: - void allocate(const JacobiSVD& svd) - { +template +class qr_preconditioner_impl { + public: + typedef typename MatrixType::Scalar Scalar; + typedef JacobiSVD SVDType; + + enum { + WorkspaceSize = internal::traits::MatrixUColsAtCompileTime, + MaxWorkspaceSize = internal::traits::MatrixUMaxColsAtCompileTime + }; + + typedef Matrix WorkspaceType; + + void allocate(const SVDType& svd) { if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.rows(), svd.cols()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.rows(), svd.cols()); } if (svd.m_computeFullU) m_workspace.resize(svd.rows()); else if (svd.m_computeThinU) m_workspace.resize(svd.cols()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.rows() > matrix.cols()) { m_qr.compute(matrix); @@ -188,41 +192,44 @@ public: private: typedef ColPivHouseholderQR QRType; QRType m_qr; - typename internal::plain_col_type::type m_workspace; + WorkspaceType m_workspace; }; -template -class qr_preconditioner_impl -{ -public: +template +class qr_preconditioner_impl { + public: typedef typename MatrixType::Scalar Scalar; - enum - { + typedef JacobiSVD SVDType; + + enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + MatrixOptions = MatrixType::Options, + WorkspaceSize = internal::traits::MatrixVColsAtCompileTime, + MaxWorkspaceSize = 
internal::traits::MatrixVMaxColsAtCompileTime }; - typedef typename internal::make_proper_matrix_type< - Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime - >::type TransposeTypeWithSameStorageOrder; + typedef Matrix WorkspaceType; - void allocate(const JacobiSVD& svd) - { + typedef typename internal::make_proper_matrix_type::type + TransposeTypeWithSameStorageOrder; + + void allocate(const SVDType& svd) { if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.cols(), svd.rows()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.cols(), svd.rows()); } if (svd.m_computeFullV) m_workspace.resize(svd.cols()); else if (svd.m_computeThinV) m_workspace.resize(svd.rows()); m_adjoint.resize(svd.cols(), svd.rows()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.cols() > matrix.rows()) { m_adjoint = matrix.adjoint(); @@ -245,28 +252,35 @@ private: typedef ColPivHouseholderQR QRType; QRType m_qr; TransposeTypeWithSameStorageOrder m_adjoint; - typename internal::plain_row_type::type m_workspace; + WorkspaceType m_workspace; }; /*** preconditioner using HouseholderQR ***/ -template -class qr_preconditioner_impl -{ -public: - void allocate(const JacobiSVD& svd) - { +template +class qr_preconditioner_impl { + public: + typedef typename MatrixType::Scalar Scalar; + typedef JacobiSVD SVDType; + + enum { + WorkspaceSize = internal::traits::MatrixUColsAtCompileTime, + MaxWorkspaceSize = internal::traits::MatrixUMaxColsAtCompileTime + }; + + typedef Matrix WorkspaceType; + + void allocate(const SVDType& svd) { if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.rows(), svd.cols()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.rows(), svd.cols()); } if (svd.m_computeFullU) m_workspace.resize(svd.rows()); else if (svd.m_computeThinU) m_workspace.resize(svd.cols()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.rows() > matrix.cols()) { m_qr.compute(matrix); @@ -282,44 +296,47 @@ public: } return false; } + private: typedef HouseholderQR QRType; QRType m_qr; - typename internal::plain_col_type::type m_workspace; + WorkspaceType m_workspace; }; -template -class qr_preconditioner_impl -{ -public: +template +class qr_preconditioner_impl { + public: typedef typename MatrixType::Scalar Scalar; - enum - { + typedef JacobiSVD SVDType; + + enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + MatrixOptions = MatrixType::Options, + WorkspaceSize = internal::traits::MatrixVColsAtCompileTime, + MaxWorkspaceSize = internal::traits::MatrixVMaxColsAtCompileTime }; - typedef typename internal::make_proper_matrix_type< - Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime - >::type TransposeTypeWithSameStorageOrder; + typedef Matrix WorkspaceType; - void allocate(const JacobiSVD& svd) - { + typedef typename internal::make_proper_matrix_type::type + TransposeTypeWithSameStorageOrder; + + void allocate(const SVDType& svd) { if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) { - m_qr.~QRType(); - ::new (&m_qr) QRType(svd.cols(), 
svd.rows()); + internal::destroy_at(&m_qr); + internal::construct_at(&m_qr, svd.cols(), svd.rows()); } if (svd.m_computeFullV) m_workspace.resize(svd.cols()); else if (svd.m_computeThinV) m_workspace.resize(svd.rows()); m_adjoint.resize(svd.cols(), svd.rows()); } - bool run(JacobiSVD& svd, const MatrixType& matrix) - { + bool run(SVDType& svd, const MatrixType& matrix) { if(matrix.cols() > matrix.rows()) { m_adjoint = matrix.adjoint(); @@ -342,7 +359,7 @@ private: typedef HouseholderQR QRType; QRType m_qr; TransposeTypeWithSameStorageOrder m_adjoint; - typename internal::plain_row_type::type m_workspace; + WorkspaceType m_workspace; }; /*** 2x2 SVD implementation @@ -350,18 +367,16 @@ private: *** JacobiSVD consists in performing a series of 2x2 SVD subproblems ***/ -template -struct svd_precondition_2x2_block_to_be_real -{ - typedef JacobiSVD SVD; +template +struct svd_precondition_2x2_block_to_be_real { + typedef JacobiSVD SVD; typedef typename MatrixType::RealScalar RealScalar; static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; } }; -template -struct svd_precondition_2x2_block_to_be_real -{ - typedef JacobiSVD SVD; +template +struct svd_precondition_2x2_block_to_be_real { + typedef JacobiSVD SVD; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) @@ -375,7 +390,7 @@ struct svd_precondition_2x2_block_to_be_real const RealScalar considerAsZero = (std::numeric_limits::min)(); const RealScalar precision = NumTraits::epsilon(); - if(n==0) + if(numext::is_exactly_zero(n)) { // make sure first column is zero work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); @@ -423,249 +438,258 @@ struct svd_precondition_2x2_block_to_be_real } }; -template -struct traits > - : traits<_MatrixType> -{ - typedef _MatrixType MatrixType; +template +struct traits > : svd_traits { + typedef MatrixType_ MatrixType; }; } // end namespace internal /** \ingroup SVD_Module - * - * - * \class JacobiSVD - * - * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix - * - * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition - * \tparam QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally - * for the R-SVD step for non-square matrices. See discussion of possible values below. - * - * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product - * \f[ A = U S V^* \f] - * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal; - * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left - * and right \em singular \em vectors of \a A respectively. - * - * Singular values are always sorted in decreasing order. - * - * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask for them explicitly. - * - * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the - * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual - * singular vectors. 
Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, - * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving. - * - * Here's an example demonstrating basic usage: - * \include JacobiSVD_basic.cpp - * Output: \verbinclude JacobiSVD_basic.out - * - * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The downside is that it's slower than - * bidiagonalizing SVD algorithms for large square matrices; however its complexity is still \f$ O(n^2p) \f$ where \a n is the smaller dimension and - * \a p is the greater dimension, meaning that it is still of the same order of complexity as the faster bidiagonalizing R-SVD algorithms. - * In particular, like any R-SVD, it takes advantage of non-squareness in that its complexity is only linear in the greater dimension. - * - * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to - * terminate in finite (and reasonable) time. - * - * The possible values for QRPreconditioner are: - * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR. - * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR. - * Contrary to other QRs, it doesn't allow computing thin unitaries. - * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses non-pivoting QR. - * This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing SVD algorithms (since bidiagonalization - * is inherently non-pivoting). However the resulting SVD is still more reliable than bidiagonalizing SVDs because the Jacobi-based iterarive - * process is more reliable than the optimized bidiagonal SVD iterations. - * \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that you will only be computing - * JacobiSVD decompositions of square matrices. Non-square matrices require a QR preconditioner. Using this option will result in - * faster compilation and smaller executable code. It won't significantly speed up computation, since JacobiSVD is always checking - * if QR preconditioning is needed before applying it anyway. - * - * \sa MatrixBase::jacobiSvd() - */ -template class JacobiSVD - : public SVDBase > -{ - typedef SVDBase Base; - public: + * + * + * \class JacobiSVD + * + * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix + * + * \tparam MatrixType_ the type of the matrix of which we are computing the SVD decomposition + * \tparam Options this optional parameter allows one to specify the type of QR decomposition that will be used + * internally for the R-SVD step for non-square matrices. Additionally, it allows one to specify whether to compute thin + * or full unitaries \a U and \a V. See discussion of possible values below. + * + * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product + * \f[ A = U S V^* \f] + * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero + * outside of its main diagonal; the diagonal entries of S are known as the \em singular \em values of \a A and the + * columns of \a U and \a V are known as the left and right \em singular \em vectors of \a A respectively. 
+ * + * Singular values are always sorted in decreasing order. + * + * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask + * for them explicitly. + * + * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p + * matrix, letting \a m be the smaller value among \a n and \a p, there are only \a m singular vectors; the remaining + * columns of \a U and \a V do not correspond to actual singular vectors. Asking for \em thin \a U or \a V means asking + * for only their \a m first columns to be formed. So \a U is then an n-by-m matrix, and \a V is then a p-by-m matrix. + * Notice that thin \a U and \a V are all you need for (least squares) solving. + * + * Here's an example demonstrating basic usage: + * \include JacobiSVD_basic.cpp + * Output: \verbinclude JacobiSVD_basic.out + * + * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The + * downside is that it's slower than bidiagonalizing SVD algorithms for large square matrices; however its complexity is + * still \f$ O(n^2p) \f$ where \a n is the smaller dimension and \a p is the greater dimension, meaning that it is still + * of the same order of complexity as the faster bidiagonalizing R-SVD algorithms. In particular, like any R-SVD, it + * takes advantage of non-squareness in that its complexity is only linear in the greater dimension. + * + * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is + * guaranteed to terminate in finite (and reasonable) time. + * + * The possible QR preconditioners that can be set with the Options template parameter are: + * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR. + * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR. + * Contrary to other QRs, it doesn't allow computing thin unitaries. + * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses + * non-pivoting QR. This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing + * SVD algorithms (since bidiagonalization is inherently non-pivoting). However the resulting SVD is still more reliable + * than bidiagonalizing SVDs because the Jacobi-based iterative process is more reliable than the optimized bidiagonal + * SVD iterations. \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that + * you will only be computing JacobiSVD decompositions of square matrices. Non-square matrices require a QR + * preconditioner. Using this option will result in faster compilation and smaller executable code. It won't + * significantly speed up computation, since JacobiSVD is always checking if QR preconditioning is needed before + * applying it anyway. + * + * One may also use the Options template parameter to specify how the unitaries should be computed. The options are + * #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV. It is not possible to request both the thin and full + * versions of a unitary. By default, unitaries will not be computed. 
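As a quick illustration of the Options-based interface documented above, here is a minimal usage sketch; the matrix A, the right-hand side b, and their sizes are placeholder data, not part of this patch:

#include <Eigen/SVD>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(6, 4);
  // Thin U and thin V are requested at compile time via the Options template parameter.
  Eigen::JacobiSVD<Eigen::MatrixXf, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(A);
  std::cout << svd.singularValues() << "\n";  // always sorted in decreasing order
  // Thin U/V are all that least-squares solving needs, as noted above.
  Eigen::VectorXf b = Eigen::VectorXf::Ones(6);
  Eigen::VectorXf x = svd.solve(b);
  std::cout << x << "\n";
  // Deprecated runtime equivalent, still accepted by this patch:
  // Eigen::JacobiSVD<Eigen::MatrixXf> svd_rt(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
}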
+ * + * You can set the QRPreconditioner and unitary options together: JacobiSVD + * + * \sa MatrixBase::jacobiSvd() + */ +template +class JacobiSVD : public SVDBase > { + typedef SVDBase Base; - typedef _MatrixType MatrixType; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime), - MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, - MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime), - MatrixOptions = MatrixType::Options - }; + public: + typedef MatrixType_ MatrixType; + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; + typedef typename Base::Index Index; + enum { + Options = Options_, + QRPreconditioner = internal::get_qr_preconditioner(Options), + RowsAtCompileTime = Base::RowsAtCompileTime, + ColsAtCompileTime = Base::ColsAtCompileTime, + DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime, + MaxRowsAtCompileTime = Base::MaxRowsAtCompileTime, + MaxColsAtCompileTime = Base::MaxColsAtCompileTime, + MaxDiagSizeAtCompileTime = Base::MaxDiagSizeAtCompileTime, + MatrixOptions = Base::MatrixOptions + }; - typedef typename Base::MatrixUType MatrixUType; - typedef typename Base::MatrixVType MatrixVType; - typedef typename Base::SingularValuesType SingularValuesType; - - typedef typename internal::plain_row_type::type RowType; - typedef typename internal::plain_col_type::type ColType; - typedef Matrix - WorkMatrixType; + typedef typename Base::MatrixUType MatrixUType; + typedef typename Base::MatrixVType MatrixVType; + typedef typename Base::SingularValuesType SingularValuesType; + typedef Matrix + WorkMatrixType; - /** \brief Default Constructor. - * - * The default constructor is useful in cases in which the user intends to - * perform decompositions via JacobiSVD::compute(const MatrixType&). - */ - JacobiSVD() - {} + /** \brief Default Constructor. + * + * The default constructor is useful in cases in which the user intends to + * perform decompositions via JacobiSVD::compute(const MatrixType&). + */ + JacobiSVD() {} + /** \brief Default Constructor with memory preallocation + * + * Like the default constructor but with preallocation of the internal data + * according to the specified problem size and \a Options template parameter. + * + * \sa JacobiSVD() + */ + JacobiSVD(Index rows, Index cols) { allocate(rows, cols, internal::get_computation_options(Options)); } - /** \brief Default Constructor with memory preallocation - * - * Like the default constructor but with preallocation of the internal data - * according to the specified problem size. - * \sa JacobiSVD() - */ - JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0) - { - allocate(rows, cols, computationOptions); - } + /** \brief Default Constructor with memory preallocation + * + * Like the default constructor but with preallocation of the internal data + * according to the specified problem size. + * + * One \b cannot request unitaries using both the \a Options template parameter + * and the constructor. If possible, prefer using the \a Options template parameter. 
+ * + * \param computationOptions specify whether to compute Thin/Full unitaries U/V + * \sa JacobiSVD() + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. + */ + EIGEN_DEPRECATED + JacobiSVD(Index rows, Index cols, unsigned int computationOptions) { + internal::check_svd_options_assertions(computationOptions, rows, cols); + allocate(rows, cols, computationOptions); + } - /** \brief Constructor performing the decomposition of given matrix. - * - * \param matrix the matrix to decompose - * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed. - * By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU, - * #ComputeFullV, #ComputeThinV. - * - * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not - * available with the (non-default) FullPivHouseholderQR preconditioner. - */ - explicit JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0) - { - compute(matrix, computationOptions); - } + /** \brief Constructor performing the decomposition of given matrix, using the custom options specified + * with the \a Options template parameter. + * + * \param matrix the matrix to decompose + */ + explicit JacobiSVD(const MatrixType& matrix) { compute_impl(matrix, internal::get_computation_options(Options)); } - /** \brief Method performing the decomposition of given matrix using custom options. - * - * \param matrix the matrix to decompose - * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed. - * By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU, - * #ComputeFullV, #ComputeThinV. - * - * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not - * available with the (non-default) FullPivHouseholderQR preconditioner. - */ - JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions); + /** \brief Constructor performing the decomposition of given matrix using specified options + * for computing unitaries. + * + * One \b cannot request unitaries using both the \a Options template parameter + * and the constructor. If possible, prefer using the \a Options template parameter. + * + * \param matrix the matrix to decompose + * \param computationOptions specify whether to compute Thin/Full unitaries U/V + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. + */ + // EIGEN_DEPRECATED // TODO(cantonios): re-enable after fixing a few 3p libraries that error on deprecation warnings. + JacobiSVD(const MatrixType& matrix, unsigned int computationOptions) { + internal::check_svd_options_assertions(computationOptions, matrix.rows(), matrix.cols()); + compute_impl(matrix, computationOptions); + } - /** \brief Method performing the decomposition of given matrix using current options. - * - * \param matrix the matrix to decompose - * - * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int). 
- */ - JacobiSVD& compute(const MatrixType& matrix) - { - return compute(matrix, m_computationOptions); - } + /** \brief Method performing the decomposition of given matrix. Computes Thin/Full unitaries U/V if specified + * using the \a Options template parameter or the class constructor. + * + * \param matrix the matrix to decompose + */ + JacobiSVD& compute(const MatrixType& matrix) { return compute_impl(matrix, m_computationOptions); } - using Base::computeU; - using Base::computeV; - using Base::rows; - using Base::cols; - using Base::rank; + /** \brief Method performing the decomposition of given matrix, as specified by + * the `computationOptions` parameter. + * + * \param matrix the matrix to decompose + * \param computationOptions specify whether to compute Thin/Full unitaries U/V + * + * \deprecated Will be removed in the next major Eigen version. Options should + * be specified in the \a Options template parameter. + */ + EIGEN_DEPRECATED + JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions) { + internal::check_svd_options_assertions(computationOptions, matrix.rows(), matrix.cols()); + return compute_impl(matrix, computationOptions); + } - private: - void allocate(Index rows, Index cols, unsigned int computationOptions); + using Base::computeU; + using Base::computeV; + using Base::rows; + using Base::cols; + using Base::rank; - protected: - using Base::m_matrixU; - using Base::m_matrixV; - using Base::m_singularValues; - using Base::m_info; - using Base::m_isInitialized; - using Base::m_isAllocated; - using Base::m_usePrescribedThreshold; - using Base::m_computeFullU; - using Base::m_computeThinU; - using Base::m_computeFullV; - using Base::m_computeThinV; - using Base::m_computationOptions; - using Base::m_nonzeroSingularValues; - using Base::m_rows; - using Base::m_cols; - using Base::m_diagSize; - using Base::m_prescribedThreshold; - WorkMatrixType m_workMatrix; + private: + void allocate(Index rows, Index cols, unsigned int computationOptions); + JacobiSVD& compute_impl(const MatrixType& matrix, unsigned int computationOptions); - template - friend struct internal::svd_precondition_2x2_block_to_be_real; - template - friend struct internal::qr_preconditioner_impl; + protected: + using Base::m_cols; + using Base::m_computationOptions; + using Base::m_computeFullU; + using Base::m_computeFullV; + using Base::m_computeThinU; + using Base::m_computeThinV; + using Base::m_diagSize; + using Base::m_info; + using Base::m_isAllocated; + using Base::m_isInitialized; + using Base::m_matrixU; + using Base::m_matrixV; + using Base::m_nonzeroSingularValues; + using Base::m_prescribedThreshold; + using Base::m_rows; + using Base::m_singularValues; + using Base::m_usePrescribedThreshold; + using Base::ShouldComputeThinU; + using Base::ShouldComputeThinV; - internal::qr_preconditioner_impl m_qr_precond_morecols; - internal::qr_preconditioner_impl m_qr_precond_morerows; - MatrixType m_scaledMatrix; + EIGEN_STATIC_ASSERT(!(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) && + !(ShouldComputeThinV && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)), + "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. 
" + "Use the ColPivHouseholderQR preconditioner instead.") + + template + friend struct internal::svd_precondition_2x2_block_to_be_real; + template + friend struct internal::qr_preconditioner_impl; + + internal::qr_preconditioner_impl + m_qr_precond_morecols; + internal::qr_preconditioner_impl + m_qr_precond_morerows; + WorkMatrixType m_workMatrix; + MatrixType m_scaledMatrix; }; -template -void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) -{ - eigen_assert(rows >= 0 && cols >= 0); +template +void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) { + if (Base::allocate(rows, cols, computationOptions)) return; - if (m_isAllocated && - rows == m_rows && - cols == m_cols && - computationOptions == m_computationOptions) - { - return; - } + eigen_assert(!(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) && + !(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) && + "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. " + "Use the ColPivHouseholderQR preconditioner instead."); - m_rows = rows; - m_cols = cols; - m_info = Success; - m_isInitialized = false; - m_isAllocated = true; - m_computationOptions = computationOptions; - m_computeFullU = (computationOptions & ComputeFullU) != 0; - m_computeThinU = (computationOptions & ComputeThinU) != 0; - m_computeFullV = (computationOptions & ComputeFullV) != 0; - m_computeThinV = (computationOptions & ComputeThinV) != 0; - eigen_assert(!(m_computeFullU && m_computeThinU) && "JacobiSVD: you can't ask for both full and thin U"); - eigen_assert(!(m_computeFullV && m_computeThinV) && "JacobiSVD: you can't ask for both full and thin V"); - eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) && - "JacobiSVD: thin U and V are only available when your matrix has a dynamic number of columns."); - if (QRPreconditioner == FullPivHouseholderQRPreconditioner) - { - eigen_assert(!(m_computeThinU || m_computeThinV) && - "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. " - "Use the ColPivHouseholderQR preconditioner instead."); - } - m_diagSize = (std::min)(m_rows, m_cols); - m_singularValues.resize(m_diagSize); - if(RowsAtCompileTime==Dynamic) - m_matrixU.resize(m_rows, m_computeFullU ? m_rows - : m_computeThinU ? m_diagSize - : 0); - if(ColsAtCompileTime==Dynamic) - m_matrixV.resize(m_cols, m_computeFullV ? m_cols - : m_computeThinV ? m_diagSize - : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols); } -template -JacobiSVD& -JacobiSVD::compute(const MatrixType& matrix, unsigned int computationOptions) -{ +template +JacobiSVD& JacobiSVD::compute_impl(const MatrixType& matrix, + unsigned int computationOptions) { using std::abs; + allocate(matrix.rows(), matrix.cols(), computationOptions); // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number of iterations, @@ -682,7 +706,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig m_info = InvalidInput; return *this; } - if(scale==RealScalar(0)) scale = RealScalar(1); + if(numext::is_exactly_zero(scale)) scale = RealScalar(1); /*** step 1. 
The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ @@ -724,8 +748,8 @@ JacobiSVD::compute(const MatrixType& matrix, unsig finished = false; // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal // the complex to real operation returns true if the updated 2x2 block is not already diagonal - if(internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q, maxDiagEntry)) - { + if (internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q, + maxDiagEntry)) { JacobiRotation j_left, j_right; internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); @@ -775,7 +799,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig { Index pos; RealScalar maxRemainingSingularValue = m_singularValues.tail(m_diagSize-i).maxCoeff(&pos); - if(maxRemainingSingularValue == RealScalar(0)) + if(numext::is_exactly_zero(maxRemainingSingularValue)) { m_nonzeroSingularValues = i; break; @@ -800,13 +824,19 @@ JacobiSVD::compute(const MatrixType& matrix, unsig * * \sa class JacobiSVD */ -template -JacobiSVD::PlainObject> -MatrixBase::jacobiSvd(unsigned int computationOptions) const -{ - return JacobiSVD(*this, computationOptions); +template +template +JacobiSVD::PlainObject, Options> MatrixBase::jacobiSvd() const { + return JacobiSVD(*this); } -} // end namespace Eigen +template +template +JacobiSVD::PlainObject, Options> MatrixBase::jacobiSvd( + unsigned int computationOptions) const { + return JacobiSVD(*this, computationOptions); +} + +} // end namespace Eigen #endif // EIGEN_JACOBISVD_H diff --git a/libs/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/libs/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h index ff0516f..93244cd 100644 --- a/libs/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/libs/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h @@ -33,14 +33,17 @@ #ifndef EIGEN_JACOBISVD_LAPACKE_H #define EIGEN_JACOBISVD_LAPACKE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \ +#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \ template<> inline \ -JacobiSVD, ColPivHouseholderQRPreconditioner>& \ -JacobiSVD, ColPivHouseholderQRPreconditioner>::compute(const Matrix& matrix, unsigned int computationOptions) \ +JacobiSVD, OPTIONS>& \ +JacobiSVD, OPTIONS>::compute_impl(const Matrix& matrix, \ + unsigned int computationOptions) \ { \ typedef Matrix MatrixType; \ /*typedef MatrixType::Scalar Scalar;*/ \ @@ -69,22 +72,41 @@ JacobiSVD, ColPiv } else { ldvt=1; vt=&dummy; }\ Matrix superb; superb.resize(m_diagSize, 1); \ MatrixType m_temp; m_temp = matrix; \ - LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index(m_rows), internal::convert_index(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \ - if (computeV()) m_matrixV = localV.adjoint(); \ + lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index(m_rows), internal::convert_index(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \ + /* Check the result of the LAPACK call */ \ + if (info < 0 || !m_singularValues.allFinite()) { \ + m_info = InvalidInput; \ + } else if (info > 0 ) { \ + m_info = NoConvergence; \ + 
} else { \ + m_info = Success; \ + if (computeV()) m_matrixV = localV.adjoint(); \ + } \ /* for(int i=0;i +void check_svd_options_assertions(unsigned int computationOptions, Index rows, Index cols) { + EIGEN_STATIC_ASSERT((Options & ComputationOptionsBits) == 0, + "SVDBase: Cannot request U or V using both static and runtime options, even if they match. " + "Requesting unitaries at runtime is DEPRECATED: " + "Prefer requesting unitaries statically, using the Options template parameter."); + eigen_assert(!(should_svd_compute_thin_u(computationOptions) && cols < rows && MatrixType::RowsAtCompileTime != Dynamic) && + !(should_svd_compute_thin_v(computationOptions) && rows < cols && MatrixType::ColsAtCompileTime != Dynamic) && + "SVDBase: If thin U is requested at runtime, your matrix must have more rows than columns or a dynamic number of rows. " + "Similarly, if thin V is requested at runtime, your matrix must have more columns than rows or a dynamic number of columns."); + (void)computationOptions; + (void)rows; + (void)cols; +} + template struct traits > : traits { @@ -27,6 +60,29 @@ template struct traits > typedef int StorageIndex; enum { Flags = 0 }; }; + +template +struct svd_traits : traits { + static constexpr int Options = Options_; + static constexpr bool ShouldComputeFullU = internal::should_svd_compute_full_u(Options); + static constexpr bool ShouldComputeThinU = internal::should_svd_compute_thin_u(Options); + static constexpr bool ShouldComputeFullV = internal::should_svd_compute_full_v(Options); + static constexpr bool ShouldComputeThinV = internal::should_svd_compute_thin_v(Options); + enum { + DiagSizeAtCompileTime = + internal::min_size_prefer_dynamic(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime), + MaxDiagSizeAtCompileTime = + internal::min_size_prefer_dynamic(MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime), + MatrixUColsAtCompileTime = ShouldComputeThinU ? DiagSizeAtCompileTime + : MatrixType::RowsAtCompileTime, + MatrixVColsAtCompileTime = ShouldComputeThinV ? DiagSizeAtCompileTime + : MatrixType::ColsAtCompileTime, + MatrixUMaxColsAtCompileTime = ShouldComputeThinU ? MaxDiagSizeAtCompileTime + : MatrixType::MaxRowsAtCompileTime, + MatrixVMaxColsAtCompileTime = ShouldComputeThinV ? MaxDiagSizeAtCompileTime + : MatrixType::MaxColsAtCompileTime + }; +}; } /** \ingroup SVD_Module @@ -52,7 +108,7 @@ template struct traits > * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving. * - * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not + * The status of the computation can be retrieved using the \a info() method. Unless \a info() returns \a Success, the results should not be
* * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to @@ -72,20 +128,38 @@ public: typedef typename NumTraits::Real RealScalar; typedef typename Eigen::internal::traits::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + + static constexpr bool ShouldComputeFullU = internal::traits::ShouldComputeFullU; + static constexpr bool ShouldComputeThinU = internal::traits::ShouldComputeThinU; + static constexpr bool ShouldComputeFullV = internal::traits::ShouldComputeFullV; + static constexpr bool ShouldComputeThinV = internal::traits::ShouldComputeThinV; + enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, - DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime), + DiagSizeAtCompileTime = internal::min_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime), MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime), - MatrixOptions = MatrixType::Options + MaxDiagSizeAtCompileTime = internal::min_size_prefer_fixed(MaxRowsAtCompileTime, MaxColsAtCompileTime), + MatrixOptions = MatrixType::Options, + MatrixUColsAtCompileTime = internal::traits::MatrixUColsAtCompileTime, + MatrixVColsAtCompileTime = internal::traits::MatrixVColsAtCompileTime, + MatrixUMaxColsAtCompileTime = internal::traits::MatrixUMaxColsAtCompileTime, + MatrixVMaxColsAtCompileTime = internal::traits::MatrixVMaxColsAtCompileTime }; - typedef Matrix MatrixUType; - typedef Matrix MatrixVType; + EIGEN_STATIC_ASSERT(!(ShouldComputeFullU && ShouldComputeThinU), "SVDBase: Cannot request both full and thin U") + EIGEN_STATIC_ASSERT(!(ShouldComputeFullV && ShouldComputeThinV), "SVDBase: Cannot request both full and thin V") + + typedef + typename internal::make_proper_matrix_type::type MatrixUType; + typedef + typename internal::make_proper_matrix_type::type MatrixVType; + typedef typename internal::plain_diag_type::type SingularValuesType; - + Derived& derived() { return *static_cast(this); } const Derived& derived() const { return *static_cast(this); } @@ -249,10 +323,7 @@ public: protected: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); - } + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) void _check_compute_assertions() const { eigen_assert(m_isInitialized && "SVD is not initialized."); @@ -267,7 +338,7 @@ protected: } // return true if already allocated - bool allocate(Index rows, Index cols, unsigned int computationOptions) ; + bool allocate(Index rows, Index cols, unsigned int computationOptions); MatrixUType m_matrixU; MatrixVType m_matrixV; @@ -285,21 +356,18 @@ protected: * Default constructor of SVDBase */ SVDBase() - : m_info(Success), - m_isInitialized(false), - m_isAllocated(false), - m_usePrescribedThreshold(false), - m_computeFullU(false), - m_computeThinU(false), - m_computeFullV(false), - m_computeThinV(false), - m_computationOptions(0), - m_rows(-1), m_cols(-1), m_diagSize(0) - { - check_template_parameters(); - } - - + : m_info(Success), + m_isInitialized(false), + m_isAllocated(false), + m_usePrescribedThreshold(false), + m_computeFullU(false), + m_computeThinU(false), + m_computeFullV(false), + m_computeThinV(false), + m_computationOptions(0), + m_rows(-1), + m_cols(-1), + 
m_diagSize(0) {} }; #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -333,9 +401,8 @@ void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) } #endif -template -bool SVDBase::allocate(Index rows, Index cols, unsigned int computationOptions) -{ +template +bool SVDBase::allocate(Index rows, Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); if (m_isAllocated && @@ -352,14 +419,13 @@ bool SVDBase::allocate(Index rows, Index cols, unsigned int computat m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; - m_computeFullU = (computationOptions & ComputeFullU) != 0; - m_computeThinU = (computationOptions & ComputeThinU) != 0; - m_computeFullV = (computationOptions & ComputeFullV) != 0; - m_computeThinV = (computationOptions & ComputeThinV) != 0; + m_computeFullU = ShouldComputeFullU || internal::should_svd_compute_full_u(computationOptions); + m_computeThinU = ShouldComputeThinU || internal::should_svd_compute_thin_u(computationOptions); + m_computeFullV = ShouldComputeFullV || internal::should_svd_compute_full_v(computationOptions); + m_computeThinV = ShouldComputeThinV || internal::should_svd_compute_thin_v(computationOptions); + eigen_assert(!(m_computeFullU && m_computeThinU) && "SVDBase: you can't ask for both full and thin U"); eigen_assert(!(m_computeFullV && m_computeThinV) && "SVDBase: you can't ask for both full and thin V"); - eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) && - "SVDBase: thin U and V are only available when your matrix has a dynamic number of columns."); m_diagSize = (std::min)(m_rows, m_cols); m_singularValues.resize(m_diagSize); diff --git a/libs/eigen/Eigen/src/SVD/UpperBidiagonalization.h b/libs/eigen/Eigen/src/SVD/UpperBidiagonalization.h index 997defc..e6c9097 100644 --- a/libs/eigen/Eigen/src/SVD/UpperBidiagonalization.h +++ b/libs/eigen/Eigen/src/SVD/UpperBidiagonalization.h @@ -11,17 +11,19 @@ #ifndef EIGEN_BIDIAGONALIZATION_H #define EIGEN_BIDIAGONALIZATION_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { // UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API. // At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class. -template class UpperBidiagonalization +template class UpperBidiagonalization { public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -37,10 +39,10 @@ template class UpperBidiagonalization typedef Matrix SuperDiagVectorType; typedef HouseholderSequence< const MatrixType, - const typename internal::remove_all::ConjugateReturnType>::type + const internal::remove_all_t::ConjugateReturnType> > HouseholderUSequenceType; typedef HouseholderSequence< - const typename internal::remove_all::type, + const internal::remove_all_t, Diagonal, OnTheRight > HouseholderVSequenceType; @@ -51,7 +53,7 @@ template class UpperBidiagonalization * The default constructor is useful in cases in which the user intends to * perform decompositions via Bidiagonalization::compute(const MatrixType&). 
*/ - UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {} + UpperBidiagonalization() : m_householder(), m_bidiagonal(0, 0), m_isInitialized(false) {} explicit UpperBidiagonalization(const MatrixType& matrix) : m_householder(matrix.rows(), matrix.cols()), @@ -60,7 +62,13 @@ template class UpperBidiagonalization { compute(matrix); } - + + UpperBidiagonalization(Index rows, Index cols) + : m_householder(rows, cols), + m_bidiagonal(cols, cols), + m_isInitialized(false) + {} + UpperBidiagonalization& compute(const MatrixType& matrix); UpperBidiagonalization& computeUnblocked(const MatrixType& matrix); @@ -161,13 +169,13 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = traits::Flags & RowMajorBit }; - typedef InnerStride ColInnerStride; - typedef InnerStride RowInnerStride; + static constexpr int StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor; + typedef InnerStride ColInnerStride; + typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; typedef Ref, 0, RowInnerStride> SubRowType; typedef Ref > SubMatType; - + Index brows = A.rows(); Index bcols = A.cols(); @@ -293,7 +301,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona Index size = (std::min)(rows, cols); // X and Y are work space - enum { StorageOrder = traits::Flags & RowMajorBit }; + static constexpr int StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor; Matrix -UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::computeUnblocked(const _MatrixType& matrix) +template +UpperBidiagonalization& UpperBidiagonalization::computeUnblocked(const MatrixType_& matrix) { Index rows = matrix.rows(); Index cols = matrix.cols(); @@ -377,8 +385,8 @@ UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::comput return *this; } -template -UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix) +template +UpperBidiagonalization& UpperBidiagonalization::compute(const MatrixType_& matrix) { Index rows = matrix.rows(); Index cols = matrix.cols(); diff --git a/libs/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h new file mode 100644 index 0000000..f8d8762 --- /dev/null +++ b/libs/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SPARSECHOLESKY_MODULE_H +#error "Please include Eigen/SparseCholesky instead of including headers inside the src directory directly." 
+#endif diff --git a/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h b/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h index 9f93e32..d90ca13 100644 --- a/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SIMPLICIAL_CHOLESKY_H #define EIGEN_SIMPLICIAL_CHOLESKY_H +#include "./InternalHeaderCheck.h" + namespace Eigen { enum SimplicialCholeskyMode { @@ -271,17 +273,17 @@ class SimplicialCholeskyBase : public SparseSolverBase RealScalar m_shiftScale; }; -template > class SimplicialLLT; -template > class SimplicialLDLT; -template > class SimplicialCholesky; +template > class SimplicialLLT; +template > class SimplicialLDLT; +template > class SimplicialCholesky; namespace internal { -template struct traits > +template struct traits > { - typedef _MatrixType MatrixType; - typedef _Ordering OrderingType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + typedef Ordering_ OrderingType; + enum { UpLo = UpLo_ }; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef SparseMatrix CholMatrixType; @@ -291,11 +293,11 @@ template struct traits struct traits > +template struct traits > { - typedef _MatrixType MatrixType; - typedef _Ordering OrderingType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + typedef Ordering_ OrderingType; + enum { UpLo = UpLo_ }; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef SparseMatrix CholMatrixType; @@ -305,11 +307,11 @@ template struct traits struct traits > +template struct traits > { - typedef _MatrixType MatrixType; - typedef _Ordering OrderingType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + typedef Ordering_ OrderingType; + enum { UpLo = UpLo_ }; }; } @@ -325,21 +327,21 @@ template struct traits - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. - * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<> + * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<> * * \implsparsesolverconcept * * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering */ -template - class SimplicialLLT : public SimplicialCholeskyBase > +template + class SimplicialLLT : public SimplicialCholeskyBase > { public: - typedef _MatrixType MatrixType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + enum { UpLo = UpLo_ }; typedef SimplicialCholeskyBase Base; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; @@ -416,21 +418,21 @@ public: * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization * such that the factorized matrix is P A P^-1. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower * or Upper. Default is Lower. 
- * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<> + * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<> * * \implsparsesolverconcept * * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering */ -template - class SimplicialLDLT : public SimplicialCholeskyBase > +template + class SimplicialLDLT : public SimplicialCholeskyBase > { public: - typedef _MatrixType MatrixType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + enum { UpLo = UpLo_ }; typedef SimplicialCholeskyBase Base; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; @@ -507,12 +509,12 @@ public: * * \sa class SimplicialLDLT, class SimplicialLLT */ -template - class SimplicialCholesky : public SimplicialCholeskyBase > +template + class SimplicialCholesky : public SimplicialCholeskyBase > { public: - typedef _MatrixType MatrixType; - enum { UpLo = _UpLo }; + typedef MatrixType_ MatrixType; + enum { UpLo = UpLo_ }; typedef SimplicialCholeskyBase Base; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; diff --git a/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 72e1740..3106c9b 100644 --- a/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/libs/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -20,6 +20,8 @@ the Mozilla Public License v. 2.0, as stated at the top of this file. #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template diff --git a/libs/eigen/Eigen/src/SparseCore/AmbiVector.h b/libs/eigen/Eigen/src/SparseCore/AmbiVector.h index 2cb7747..594e91d 100644 --- a/libs/eigen/Eigen/src/SparseCore/AmbiVector.h +++ b/libs/eigen/Eigen/src/SparseCore/AmbiVector.h @@ -10,6 +10,8 @@ #ifndef EIGEN_AMBIVECTOR_H #define EIGEN_AMBIVECTOR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -19,12 +21,12 @@ namespace internal { * * See BasicSparseLLT and SparseProduct for usage examples. */ -template +template class AmbiVector { public: - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; typedef typename NumTraits::Real RealScalar; explicit AmbiVector(Index size) @@ -125,8 +127,8 @@ class AmbiVector }; /** \returns the number of non zeros in the current sub vector */ -template -Index AmbiVector<_Scalar,_StorageIndex>::nonZeros() const +template +Index AmbiVector::nonZeros() const { if (m_mode==IsSparse) return m_llSize; @@ -134,8 +136,8 @@ Index AmbiVector<_Scalar,_StorageIndex>::nonZeros() const return m_end - m_start; } -template -void AmbiVector<_Scalar,_StorageIndex>::init(double estimatedDensity) +template +void AmbiVector::init(double estimatedDensity) { if (estimatedDensity>0.1) init(IsDense); @@ -143,8 +145,8 @@ void AmbiVector<_Scalar,_StorageIndex>::init(double estimatedDensity) init(IsSparse); } -template -void AmbiVector<_Scalar,_StorageIndex>::init(int mode) +template +void AmbiVector::init(int mode) { m_mode = mode; // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings @@ -160,15 +162,15 @@ void AmbiVector<_Scalar,_StorageIndex>::init(int mode) * * Don't worry, this function is extremely cheap. 
*/ -template -void AmbiVector<_Scalar,_StorageIndex>::restart() +template +void AmbiVector::restart() { m_llCurrent = m_llStart; } /** Set all coefficients of current subvector to zero */ -template -void AmbiVector<_Scalar,_StorageIndex>::setZero() +template +void AmbiVector::setZero() { if (m_mode==IsDense) { @@ -183,8 +185,8 @@ void AmbiVector<_Scalar,_StorageIndex>::setZero() } } -template -_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeffRef(Index i) +template +Scalar_& AmbiVector::coeffRef(Index i) { if (m_mode==IsDense) return m_buffer[i]; @@ -252,8 +254,8 @@ _Scalar& AmbiVector<_Scalar,_StorageIndex>::coeffRef(Index i) } } -template -_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeff(Index i) +template +Scalar_& AmbiVector::coeff(Index i) { if (m_mode==IsDense) return m_buffer[i]; @@ -280,11 +282,11 @@ _Scalar& AmbiVector<_Scalar,_StorageIndex>::coeff(Index i) } /** Iterator over the nonzero coefficients */ -template -class AmbiVector<_Scalar,_StorageIndex>::Iterator +template +class AmbiVector::Iterator { public: - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef typename NumTraits::Real RealScalar; /** Default constructor diff --git a/libs/eigen/Eigen/src/SparseCore/CompressedStorage.h b/libs/eigen/Eigen/src/SparseCore/CompressedStorage.h index acd986f..733b1aa 100644 --- a/libs/eigen/Eigen/src/SparseCore/CompressedStorage.h +++ b/libs/eigen/Eigen/src/SparseCore/CompressedStorage.h @@ -10,6 +10,8 @@ #ifndef EIGEN_COMPRESSED_STORAGE_H #define EIGEN_COMPRESSED_STORAGE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -18,13 +20,13 @@ namespace internal { * Stores a sparse set of values as a list of values and a list of indices. * */ -template +template class CompressedStorage { public: - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; protected: @@ -69,8 +71,8 @@ class CompressedStorage ~CompressedStorage() { - delete[] m_values; - delete[] m_indices; + conditional_aligned_delete_auto(m_values, m_allocatedSize); + conditional_aligned_delete_auto(m_indices, m_allocatedSize); } void reserve(Index size) @@ -178,24 +180,13 @@ class CompressedStorage { if (m_allocatedSize newValues(m_allocatedSize); - internal::scoped_array newIndices(m_allocatedSize); - - // copy first chunk - internal::smart_copy(m_values, m_values +id, newValues.ptr()); - internal::smart_copy(m_indices, m_indices+id, newIndices.ptr()); - - // copy the rest - if(m_size>id) - { - internal::smart_copy(m_values +id, m_values +m_size, newValues.ptr() +id+1); - internal::smart_copy(m_indices+id, m_indices+m_size, newIndices.ptr()+id+1); - } - std::swap(m_values,newValues.ptr()); - std::swap(m_indices,newIndices.ptr()); + Index newAllocatedSize = 2 * (m_size + 1); + m_values = conditional_aligned_realloc_new_auto(m_values, newAllocatedSize, m_allocatedSize); + m_indices = + conditional_aligned_realloc_new_auto(m_indices, newAllocatedSize, m_allocatedSize); + m_allocatedSize = newAllocatedSize; } - else if(m_size>id) + if(m_size>id) { internal::smart_memmove(m_values +id, m_values +m_size, m_values +id+1); internal::smart_memmove(m_indices+id, m_indices+m_size, m_indices+id+1); @@ -223,22 +214,6 @@ class CompressedStorage } } - void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) - { - Index k = 0; - Index n = size(); - for (Index i=0; i newValues(size); - internal::scoped_array newIndices(size); - Index copySize = (std::min)(size, m_size); - if (copySize>0) { - 
internal::smart_copy(m_values, m_values+copySize, newValues.ptr()); - internal::smart_copy(m_indices, m_indices+copySize, newIndices.ptr()); - } - std::swap(m_values,newValues.ptr()); - std::swap(m_indices,newIndices.ptr()); + m_values = conditional_aligned_realloc_new_auto(m_values, size, m_allocatedSize); + m_indices = conditional_aligned_realloc_new_auto(m_indices, size, m_allocatedSize); m_allocatedSize = size; } diff --git a/libs/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/libs/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 9486502..f852493 100644 --- a/libs/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/libs/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -10,6 +10,8 @@ #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -17,9 +19,9 @@ namespace internal { template static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all::type::Scalar LhsScalar; - typedef typename remove_all::type::Scalar RhsScalar; - typedef typename remove_all::type::Scalar ResScalar; + typedef typename remove_all_t::Scalar LhsScalar; + typedef typename remove_all_t::Scalar RhsScalar; + typedef typename remove_all_t::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. Index rows = lhs.innerSize(); @@ -124,6 +126,11 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r namespace internal { + +// Helper template to generate new sparse matrix types +template +using WithStorageOrder = SparseMatrix; + template::Flags&RowMajorBit) ? RowMajor : ColMajor, int RhsStorageOrder = (traits::Flags&RowMajorBit) ? RowMajor : ColMajor, @@ -133,20 +140,20 @@ struct conservative_sparse_sparse_product_selector; template struct conservative_sparse_sparse_product_selector { - typedef typename remove_all::type LhsCleaned; + typedef remove_all_t LhsCleaned; typedef typename LhsCleaned::Scalar Scalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix RowMajorMatrix; - typedef SparseMatrix ColMajorMatrixAux; - typedef typename sparse_eval::type ColMajorMatrix; + using RowMajorMatrix = WithStorageOrder; + using ColMajorMatrixAux = WithStorageOrder; // If the result is tall and thin (in the extreme case a column vector) // then it is faster to sort the coefficients inplace instead of transposing twice. // FIXME, the following heuristic is probably not very good. 
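// In the tall-thin case each output column carries few nonzeros, so inserting them in
// sorted order directly into a column-major result tends to be cheaper than filling an
// unsorted temporary and sorting it via the two storage-order conversions below.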
if(lhs.rows()>rhs.cols()) { + using ColMajorMatrix = typename sparse_eval::type; ColMajorMatrix resCol(lhs.rows(),rhs.cols()); // perform sorted insertion internal::conservative_sparse_sparse_product_impl(lhs, rhs, resCol, true); @@ -168,8 +175,8 @@ struct conservative_sparse_sparse_product_selector RowMajorRhs; - typedef SparseMatrix RowMajorRes; + using RowMajorRhs = WithStorageOrder; + using RowMajorRes = WithStorageOrder; RowMajorRhs rhsRow = rhs; RowMajorRes resRow(lhs.rows(), rhs.cols()); internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); @@ -182,8 +189,8 @@ struct conservative_sparse_sparse_product_selector RowMajorLhs; - typedef SparseMatrix RowMajorRes; + using RowMajorLhs = WithStorageOrder; + using RowMajorRes = WithStorageOrder; RowMajorLhs lhsRow = lhs; RowMajorRes resRow(lhs.rows(), rhs.cols()); internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); @@ -196,9 +203,9 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhs, resRow); + using RowMajorRes = WithStorageOrder; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhs, resRow); res = resRow; } }; @@ -207,13 +214,13 @@ struct conservative_sparse_sparse_product_selector struct conservative_sparse_sparse_product_selector { - typedef typename traits::type>::Scalar Scalar; + typedef typename traits>::Scalar Scalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef SparseMatrix ColMajorMatrix; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhs, rhs, resCol); + using ColMajorRes = WithStorageOrder; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhs, rhs, resCol); res = resCol; } }; @@ -223,8 +230,8 @@ struct conservative_sparse_sparse_product_selector ColMajorLhs; - typedef SparseMatrix ColMajorRes; + using ColMajorLhs = WithStorageOrder; + using ColMajorRes = WithStorageOrder; ColMajorLhs lhsCol = lhs; ColMajorRes resCol(lhs.rows(), rhs.cols()); internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); @@ -237,8 +244,8 @@ struct conservative_sparse_sparse_product_selector ColMajorRhs; - typedef SparseMatrix ColMajorRes; + using ColMajorRhs = WithStorageOrder; + using ColMajorRes = WithStorageOrder; ColMajorRhs rhsCol = rhs; ColMajorRes resCol(lhs.rows(), rhs.cols()); internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); @@ -251,12 +258,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - typedef SparseMatrix ColMajorMatrix; - RowMajorMatrix resRow(lhs.rows(),rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhs, resRow); + using ColMajorRes = WithStorageOrder; + using RowMajorRes = WithStorageOrder; + RowMajorRes resRow(lhs.rows(),rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhs, resRow); // sort the non zeros: - ColMajorMatrix resCol(resRow); + ColMajorRes resCol(resRow); res = resCol; } }; @@ -269,8 +276,8 @@ namespace internal { template static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all::type::Scalar LhsScalar; - typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all_t::Scalar LhsScalar; + typedef typename remove_all_t::Scalar RhsScalar; Index cols = rhs.outerSize(); 
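// Dense destination: every nonzero rhs(k,j) scales the sparse column lhs.col(k) and is
// accumulated straight into column j of the dense result, so no sparse temporary and no
// index sorting are needed.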
eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -317,7 +324,7 @@ struct sparse_sparse_to_dense_product_selector ColMajorLhs; + using ColMajorLhs = WithStorageOrder; ColMajorLhs lhsCol(lhs); internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); } @@ -328,7 +335,7 @@ struct sparse_sparse_to_dense_product_selector ColMajorRhs; + using ColMajorRhs = WithStorageOrder; ColMajorRhs rhsCol(rhs); internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); } diff --git a/libs/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h new file mode 100644 index 0000000..9de5936 --- /dev/null +++ b/libs/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SPARSECORE_MODULE_H +#error "Please include Eigen/SparseCore instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h b/libs/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h deleted file mode 100644 index 67718c8..0000000 --- a/libs/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +++ /dev/null @@ -1,67 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2014 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_MAPPED_SPARSEMATRIX_H -#define EIGEN_MAPPED_SPARSEMATRIX_H - -namespace Eigen { - -/** \deprecated Use Map > - * \class MappedSparseMatrix - * - * \brief Sparse matrix - * - * \param _Scalar the scalar type, i.e. the type of the coefficients - * - * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme. - * - */ -namespace internal { -template -struct traits > : traits > -{}; -} // end namespace internal - -template -class MappedSparseMatrix - : public Map > -{ - typedef Map > Base; - - public: - - typedef typename Base::StorageIndex StorageIndex; - typedef typename Base::Scalar Scalar; - - inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0) - : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr) - {} - - /** Empty destructor */ - inline ~MappedSparseMatrix() {} -}; - -namespace internal { - -template -struct evaluator > - : evaluator > > -{ - typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType; - typedef evaluator > Base; - - evaluator() : Base() {} - explicit evaluator(const XprType &mat) : Base(mat) {} -}; - -} - -} // end namespace Eigen - -#endif // EIGEN_MAPPED_SPARSEMATRIX_H diff --git a/libs/eigen/Eigen/src/SparseCore/SparseAssign.h b/libs/eigen/Eigen/src/SparseCore/SparseAssign.h index 905485c..29f6af4 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseAssign.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseAssign.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEASSIGN_H #define EIGEN_SPARSEASSIGN_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template @@ -78,12 +80,18 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) const bool transpose = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit); const Index outerEvaluationSize = (SrcEvaluatorType::Flags&RowMajorBit) ? 
src.rows() : src.cols(); + + Index reserveSize = 0; + for (Index j = 0; j < outerEvaluationSize; ++j) + for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it) + reserveSize++; + if ((!transpose) && src.isRValue()) { // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); + dst.reserve(reserveSize); for (Index j=0; j dense1 = dense2; dense1 += sparse; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::Shape,DenseShape>::value>::type + std::enable_if_t::Shape,DenseShape>::value> run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, const internal::assign_op& /*func*/) { @@ -188,7 +196,7 @@ struct assignment_from_dense_op_sparse // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::Shape,DenseShape>::value>::type + std::enable_if_t::Shape,DenseShape>::value> run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, const internal::assign_op& /*func*/) { @@ -206,8 +214,8 @@ struct assignment_from_dense_op_sparse template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \ struct Assignment, const Lhs, const Rhs>, internal::ASSIGN_OP, \ Sparse2Dense, \ - typename internal::enable_if< internal::is_same::Shape,DenseShape>::value \ - || internal::is_same::Shape,DenseShape>::value>::type> \ + std::enable_if_t< internal::is_same::Shape,DenseShape>::value \ + || internal::is_same::Shape,DenseShape>::value>> \ : assignment_from_dense_op_sparse, internal::ASSIGN_OP2 > \ {} diff --git a/libs/eigen/Eigen/src/SparseCore/SparseBlock.h b/libs/eigen/Eigen/src/SparseCore/SparseBlock.h index 5b4f6cc..b3fc859 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseBlock.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseBlock.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_BLOCK_H #define EIGEN_SPARSE_BLOCK_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // Subset of columns or rows @@ -17,7 +19,7 @@ template class BlockImpl : public SparseMatrixBase > { - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef internal::remove_all_t MatrixTypeNested_; typedef Block BlockType; public: enum { IsRowMajor = internal::traits::IsRowMajor }; @@ -96,7 +98,7 @@ template class sparse_matrix_block_impl : public SparseCompressedBase > { - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef internal::remove_all_t MatrixTypeNested_; typedef Block BlockType; typedef SparseCompressedBase > Base; using Base::convert_index; @@ -119,8 +121,8 @@ public: template inline BlockType& operator=(const SparseMatrixBase& other) { - typedef typename internal::remove_all::type _NestedMatrixType; - _NestedMatrixType& matrix = m_matrix; + typedef internal::remove_all_t NestedMatrixType_; + NestedMatrixType_& matrix = m_matrix; // This assignment is slow if this vector set is not empty // and/or it is not at the end of the nonzeros of the underlying matrix. 
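    // Editorial sketch of the cheap vs. slow path described above, assuming a
    // fresh column-major matrix whose column blocks are assigned left to right:
    //
    //   SparseMatrix<double> A(n, 3 * k);
    //   A.middleCols(0, k)     = B0;  // appends at the end of the nonzeros: cheap
    //   A.middleCols(k, k)     = B1;  // still the trailing columns: cheap
    //   A.middleCols(0, k)     = B2;  // rewrites interior nonzeros: slow path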
@@ -283,13 +285,13 @@ public: } // namespace internal -template -class BlockImpl,BlockRows,BlockCols,true,Sparse> - : public internal::sparse_matrix_block_impl,BlockRows,BlockCols> +template +class BlockImpl,BlockRows,BlockCols,true,Sparse> + : public internal::sparse_matrix_block_impl,BlockRows,BlockCols> { public: - typedef _StorageIndex StorageIndex; - typedef SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType; + typedef StorageIndex_ StorageIndex; + typedef SparseMatrix SparseMatrixType; typedef internal::sparse_matrix_block_impl Base; inline BlockImpl(SparseMatrixType& xpr, Index i) : Base(xpr, i) @@ -302,13 +304,13 @@ public: using Base::operator=; }; -template -class BlockImpl,BlockRows,BlockCols,true,Sparse> - : public internal::sparse_matrix_block_impl,BlockRows,BlockCols> +template +class BlockImpl,BlockRows,BlockCols,true,Sparse> + : public internal::sparse_matrix_block_impl,BlockRows,BlockCols> { public: - typedef _StorageIndex StorageIndex; - typedef const SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType; + typedef StorageIndex_ StorageIndex; + typedef const SparseMatrix SparseMatrixType; typedef internal::sparse_matrix_block_impl Base; inline BlockImpl(SparseMatrixType& xpr, Index i) : Base(xpr, i) @@ -340,7 +342,7 @@ public: enum { IsRowMajor = internal::traits::IsRowMajor }; EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef internal::remove_all_t MatrixTypeNested_; /** Column or Row constructor */ @@ -429,17 +431,12 @@ struct unary_evaluator, IteratorBa enum { IsRowMajor = XprType::IsRowMajor, - - OuterVector = (BlockCols==1 && ArgType::IsRowMajor) - | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". - // revert to || as soon as not needed anymore. - (BlockRows==1 && !ArgType::IsRowMajor), - + OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor), CoeffReadCost = evaluator::CoeffReadCost, Flags = XprType::Flags }; - typedef typename internal::conditional::type InnerIterator; + typedef std::conditional_t InnerIterator; explicit unary_evaluator(const XprType& op) : m_argImpl(op.nestedExpression()), m_block(op) @@ -467,7 +464,7 @@ template class unary_evaluator, IteratorBased>::InnerVectorInnerIterator : public EvalIterator { - // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator + // NOTE MSVC fails to compile if we don't explicitly "import" IsRowMajor from unary_evaluator // because the base class EvalIterator has a private IsRowMajor enum too. 
(bug #1786) // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; @@ -533,8 +530,8 @@ public: while(++m_outerPos -struct unary_evaluator,BlockRows,BlockCols,true>, IteratorBased> - : evaluator,BlockRows,BlockCols,true> > > +template +struct unary_evaluator,BlockRows,BlockCols,true>, IteratorBased> + : evaluator,BlockRows,BlockCols,true> > > { - typedef Block,BlockRows,BlockCols,true> XprType; + typedef Block,BlockRows,BlockCols,true> XprType; typedef evaluator > Base; explicit unary_evaluator(const XprType &xpr) : Base(xpr) {} }; -template -struct unary_evaluator,BlockRows,BlockCols,true>, IteratorBased> - : evaluator,BlockRows,BlockCols,true> > > +template +struct unary_evaluator,BlockRows,BlockCols,true>, IteratorBased> + : evaluator,BlockRows,BlockCols,true> > > { - typedef Block,BlockRows,BlockCols,true> XprType; + typedef Block,BlockRows,BlockCols,true> XprType; typedef evaluator > Base; explicit unary_evaluator(const XprType &xpr) : Base(xpr) {} }; diff --git a/libs/eigen/Eigen/src/SparseCore/SparseColEtree.h b/libs/eigen/Eigen/src/SparseCore/SparseColEtree.h index ebe02d1..ff32458 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseColEtree.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseColEtree.h @@ -31,6 +31,8 @@ #ifndef SPARSE_COLETREE_H #define SPARSE_COLETREE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseCore/SparseCompressedBase.h b/libs/eigen/Eigen/src/SparseCore/SparseCompressedBase.h index 6a2c7a8..243cd16 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseCompressedBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_COMPRESSED_BASE_H #define EIGEN_SPARSE_COMPRESSED_BASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class SparseCompressedBase; @@ -20,6 +22,9 @@ template struct traits > : traits {}; +template +struct inner_sort_impl; + } // end namespace internal /** \ingroup SparseCore_Module @@ -124,6 +129,40 @@ class SparseCompressedBase * * \sa valuePtr(), isCompressed() */ Map > coeffs() { eigen_assert(isCompressed()); return Array::Map(valuePtr(),nonZeros()); } + + /** sorts the inner vectors in the range [begin,end) with respect to `Comp` + * \sa innerIndicesAreSorted() */ + template > + inline void sortInnerIndices(Index begin, Index end) { + eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin); + internal::inner_sort_impl::run(*this, begin, end); + } + + /** \returns the index of the first inner vector in the range [begin,end) that is not sorted with respect to `Comp`, or `end` if the range is fully sorted + * \sa sortInnerIndices() */ + template > + inline Index innerIndicesAreSorted(Index begin, Index end) const { + eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin); + return internal::inner_sort_impl::check(*this, begin, end); + } + + /** sorts the inner vectors in the range [0,outerSize) with respect to `Comp` + * \sa innerIndicesAreSorted() */ + template > + inline void sortInnerIndices() { + Index begin = 0; + Index end = derived().outerSize(); + internal::inner_sort_impl::run(*this, begin, end); + } + + /** \returns the index of the first inner vector in the range [0,outerSize) that is not sorted with respect to `Comp`, or `outerSize` if the range is fully sorted + * \sa sortInnerIndices() */ + template> + inline Index innerIndicesAreSorted() const { + Index begin = 0; + Index end = 
derived().outerSize();
+    return internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::check(*this, begin, end);
+  }
 
   protected:
     /** Default constructor. Do nothing. */
@@ -194,8 +233,7 @@ class SparseCompressedBase::InnerIterator
       }
     }
 
-    explicit InnerIterator(const SparseCompressedBase& mat)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_id(0), m_end(mat.nonZeros())
+    explicit InnerIterator(const SparseCompressedBase& mat) : InnerIterator(mat, Index(0))
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
     }
@@ -305,6 +343,138 @@ class SparseCompressedBase::ReverseInnerIterator
 
 namespace internal {
 
+
+// modified from https://artificial-mind.net/blog/2020/11/28/std-sort-multiple-ranges
+template <typename Scalar, typename StorageIndex>
+class CompressedStorageIterator;
+
+// wrapper class analogous to std::pair
+// used to define assignment, swap, and comparison operators for CompressedStorageIterator
+template <typename Scalar, typename StorageIndex>
+class StorageRef
+{
+public:
+  using value_type = std::pair<StorageIndex, Scalar>;
+
+  inline StorageRef& operator=(const StorageRef& other) {
+    *m_innerIndexIterator = *other.m_innerIndexIterator;
+    *m_valueIterator = *other.m_valueIterator;
+    return *this;
+  }
+  inline StorageRef& operator=(const value_type& other) {
+    std::tie(*m_innerIndexIterator, *m_valueIterator) = other;
+    return *this;
+  }
+  inline operator value_type() const { return std::make_pair(*m_innerIndexIterator, *m_valueIterator); }
+  inline friend void swap(const StorageRef& a, const StorageRef& b) {
+    std::iter_swap(a.m_innerIndexIterator, b.m_innerIndexIterator);
+    std::iter_swap(a.m_valueIterator, b.m_valueIterator);
+  }
+
+  inline static const StorageIndex& key(const StorageRef& a) { return *a.m_innerIndexIterator; }
+  inline static const StorageIndex& key(const value_type& a) { return a.first; }
+  #define REF_COMP_REF(OP) inline friend bool operator OP(const StorageRef& a, const StorageRef& b) { return key(a) OP key(b); };
+  #define REF_COMP_VAL(OP) inline friend bool operator OP(const StorageRef& a, const value_type& b) { return key(a) OP key(b); };
+  #define VAL_COMP_REF(OP) inline friend bool operator OP(const value_type& a, const StorageRef& b) { return key(a) OP key(b); };
+  #define MAKE_COMPS(OP) REF_COMP_REF(OP) REF_COMP_VAL(OP) VAL_COMP_REF(OP)
+  MAKE_COMPS(<) MAKE_COMPS(>) MAKE_COMPS(<=) MAKE_COMPS(>=) MAKE_COMPS(==) MAKE_COMPS(!=)
+
+protected:
+  StorageIndex* m_innerIndexIterator;
+  Scalar* m_valueIterator;
+private:
+  StorageRef() = delete;
+  // these constructors are only called by the CompressedStorageIterator constructors for convenience only
+  StorageRef(StorageIndex* innerIndexIterator, Scalar* valueIterator) : m_innerIndexIterator(innerIndexIterator), m_valueIterator(valueIterator) {}
+  StorageRef(const StorageRef& other) : m_innerIndexIterator(other.m_innerIndexIterator), m_valueIterator(other.m_valueIterator) {}
+
+  friend class CompressedStorageIterator<Scalar, StorageIndex>;
+};
+
+// STL-compatible iterator class that operates on inner indices and values
+template <typename Scalar, typename StorageIndex>
+class CompressedStorageIterator
+{
+public:
+  using iterator_category = std::random_access_iterator_tag;
+  using reference = StorageRef<Scalar, StorageIndex>;
+  using difference_type = Index;
+  using value_type = typename reference::value_type;
+  using pointer = value_type*;
+
+  CompressedStorageIterator() = delete;
+  CompressedStorageIterator(difference_type index, StorageIndex* innerIndexPtr, Scalar* valuePtr) : m_index(index), m_data(innerIndexPtr, valuePtr) {}
+  CompressedStorageIterator(difference_type index, reference data) : m_index(index), m_data(data) {}
+  CompressedStorageIterator(const CompressedStorageIterator& other) : m_index(other.m_index), m_data(other.m_data) {}
+  inline CompressedStorageIterator& operator=(const CompressedStorageIterator& other) {
+    m_index = other.m_index;
+    m_data = other.m_data;
+    return *this;
+  }
+
+  inline CompressedStorageIterator operator+(difference_type offset) const { return CompressedStorageIterator(m_index + offset, m_data); }
+  inline CompressedStorageIterator operator-(difference_type offset) const { return CompressedStorageIterator(m_index - offset, m_data); }
+  inline difference_type operator-(const CompressedStorageIterator& other) const { return m_index - other.m_index; }
+  inline CompressedStorageIterator& operator++() { ++m_index; return *this; }
+  inline CompressedStorageIterator& operator--() { --m_index; return *this; }
+  inline CompressedStorageIterator& operator+=(difference_type offset) { m_index += offset; return *this; }
+  inline CompressedStorageIterator& operator-=(difference_type offset) { m_index -= offset; return *this; }
+  inline reference operator*() const { return reference(m_data.m_innerIndexIterator + m_index, m_data.m_valueIterator + m_index); }
+
+  #define MAKE_COMP(OP) inline bool operator OP(const CompressedStorageIterator& other) const { return m_index OP other.m_index; }
+  MAKE_COMP(<) MAKE_COMP(>) MAKE_COMP(>=) MAKE_COMP(<=) MAKE_COMP(!=) MAKE_COMP(==)
+
+protected:
+  difference_type m_index;
+  reference m_data;
+};
+
+template <typename Derived, class Comp, bool IsVector>
+struct inner_sort_impl {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+      CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+      std::sort(begin_it, end_it, Comp());
+    }
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+      const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+      bool is_sorted = std::is_sorted(begin_it, end_it, Comp());
+      if (!is_sorted) return outer;
+    }
+    return end;
+  }
+};
+template <typename Derived, class Comp>
+struct inner_sort_impl<Derived, Comp, true> {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+    CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+    std::sort(begin_it, end_it, Comp());
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+    const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+    return std::is_sorted(begin_it, end_it, Comp()) ?
1 : 0; + } +}; + template struct evaluator > : evaluator_base diff --git a/libs/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/libs/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 9b0d3f9..17cdb8e 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_CWISE_BINARY_OP_H #define EIGEN_SPARSE_CWISE_BINARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // Here we have to handle 3 cases: @@ -40,14 +42,11 @@ class CwiseBinaryOpImpl typedef CwiseBinaryOp Derived; typedef SparseMatrixBase Base; EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) - CwiseBinaryOpImpl() - { - EIGEN_STATIC_ASSERT(( - (!internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value) - || ((internal::evaluator::Flags&RowMajorBit) == (internal::evaluator::Flags&RowMajorBit))), - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH); - } + EIGEN_STATIC_ASSERT(( + (!internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value) + || ((internal::evaluator::Flags&RowMajorBit) == (internal::evaluator::Flags&RowMajorBit))), + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH) }; namespace internal { diff --git a/libs/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/libs/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index 32dac0f..6f48fa7 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_CWISE_UNARY_OP_H #define EIGEN_SPARSE_CWISE_UNARY_OP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseCore/SparseDenseProduct.h b/libs/eigen/Eigen/src/SparseCore/SparseDenseProduct.h index f005a18..9c0c531 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseDenseProduct.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEDENSEPRODUCT_H #define EIGEN_SPARSEDENSEPRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -26,9 +28,9 @@ struct sparse_time_dense_product_impl; template struct sparse_time_dense_product_impl { - typedef typename internal::remove_all::type Lhs; - typedef typename internal::remove_all::type Rhs; - typedef typename internal::remove_all::type Res; + typedef internal::remove_all_t Lhs; + typedef internal::remove_all_t Rhs; + typedef internal::remove_all_t Res; typedef typename evaluator::InnerIterator LhsInnerIterator; typedef evaluator LhsEval; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) @@ -63,18 +65,26 @@ struct sparse_time_dense_product_impl let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators -// template -// struct ScalarBinaryOpTraits > +// template +// struct ScalarBinaryOpTraits > // { // enum { // Defined = 1 @@ -85,9 +95,9 @@ struct sparse_time_dense_product_impl struct sparse_time_dense_product_impl { - typedef typename internal::remove_all::type Lhs; - typedef typename internal::remove_all::type Rhs; - typedef typename internal::remove_all::type Res; + typedef internal::remove_all_t Lhs; + typedef internal::remove_all_t Rhs; + typedef internal::remove_all_t Res; typedef evaluator LhsEval; typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) @@ -109,9 +119,9 @@ struct 
sparse_time_dense_product_impl struct sparse_time_dense_product_impl { - typedef typename internal::remove_all::type Lhs; - typedef typename internal::remove_all::type Rhs; - typedef typename internal::remove_all::type Res; + typedef internal::remove_all_t Lhs; + typedef internal::remove_all_t Rhs; + typedef internal::remove_all_t Res; typedef evaluator LhsEval; typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) @@ -149,9 +159,9 @@ struct sparse_time_dense_product_impl struct sparse_time_dense_product_impl { - typedef typename internal::remove_all::type Lhs; - typedef typename internal::remove_all::type Rhs; - typedef typename internal::remove_all::type Res; + typedef internal::remove_all_t Lhs; + typedef internal::remove_all_t Rhs; + typedef internal::remove_all_t Res; typedef typename evaluator::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { @@ -226,16 +236,16 @@ template struct sparse_dense_outer_product_evaluator { protected: - typedef typename conditional::type Lhs1; - typedef typename conditional::type ActualRhs; + typedef std::conditional_t Lhs1; + typedef std::conditional_t ActualRhs; typedef Product ProdXprType; // if the actual left-hand side is a dense vector, // then build a sparse-view so that we can seamlessly iterate over it. - typedef typename conditional::StorageKind,Sparse>::value, - Lhs1, SparseView >::type ActualLhs; - typedef typename conditional::StorageKind,Sparse>::value, - Lhs1 const&, SparseView >::type LhsArg; + typedef std::conditional_t::StorageKind,Sparse>::value, + Lhs1, SparseView > ActualLhs; + typedef std::conditional_t::StorageKind,Sparse>::value, + Lhs1 const&, SparseView > LhsArg; typedef evaluator LhsEval; typedef evaluator RhsEval; diff --git a/libs/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h b/libs/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h index 941c03b..4dc9502 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_DIAGONAL_PRODUCT_H #define EIGEN_SPARSE_DIAGONAL_PRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { // The product of a diagonal matrix with a sparse matrix can be easily diff --git a/libs/eigen/Eigen/src/SparseCore/SparseDot.h b/libs/eigen/Eigen/src/SparseCore/SparseDot.h index 38bc4aa..a45ecfa 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseDot.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseDot.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_DOT_H #define EIGEN_SPARSE_DOT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template diff --git a/libs/eigen/Eigen/src/SparseCore/SparseFuzzy.h b/libs/eigen/Eigen/src/SparseCore/SparseFuzzy.h index 7d47eb9..dcfdde9 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseFuzzy.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseFuzzy.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_FUZZY_H #define EIGEN_SPARSE_FUZZY_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template @@ -17,9 +19,9 @@ template bool SparseMatrixBase::isApprox(const SparseMatrixBase& other, const RealScalar &prec) const { const typename internal::nested_eval::type actualA(derived()); - typename internal::conditional::type, - const PlainObject>::type actualB(other.derived()); + const PlainObject> actualB(other.derived()); return (actualA - 
actualB).squaredNorm() <= prec * prec * numext::mini(actualA.squaredNorm(), actualB.squaredNorm()); } diff --git a/libs/eigen/Eigen/src/SparseCore/SparseMap.h b/libs/eigen/Eigen/src/SparseCore/SparseMap.h index f99be33..0ee3813 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseMap.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseMap.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_MAP_H #define EIGEN_SPARSE_MAP_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -58,12 +60,12 @@ class SparseMapBase using Base::operator=; protected: - typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, const Scalar *>::type ScalarPointer; - typedef typename internal::conditional< - bool(internal::is_lvalue::value), - StorageIndex *, const StorageIndex *>::type IndexPointer; + typedef std::conditional_t< + bool(internal::is_lvalue::value), + Scalar *, const Scalar *> ScalarPointer; + typedef std::conditional_t< + bool(internal::is_lvalue::value), + StorageIndex *, const StorageIndex *> IndexPointer; Index m_outerSize; Index m_innerSize; @@ -237,6 +239,7 @@ class Map /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * The inner indices must be sorted appropriately. * * This constructor is available only if \c SparseMatrixType is non-const. * diff --git a/libs/eigen/Eigen/src/SparseCore/SparseMatrix.h b/libs/eigen/Eigen/src/SparseCore/SparseMatrix.h index 616b4a0..6806812 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseMatrix.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseMatrix.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEMATRIX_H #define EIGEN_SPARSEMATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup SparseCore_Module @@ -29,10 +31,10 @@ namespace Eigen { * * More details on this storage sceheme are given in the \ref TutorialSparse "manual pages". * - * \tparam _Scalar the scalar type, i.e. the type of the coefficients - * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility + * \tparam Scalar_ the scalar type, i.e. the type of the coefficients + * \tparam Options_ Union of bit flags controlling the storage scheme. Currently the only possibility * is ColMajor or RowMajor. The default is 0 which means column-major. - * \tparam _StorageIndex the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int. + * \tparam StorageIndex_ the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int. * * \warning In %Eigen 3.2, the undocumented type \c SparseMatrix::Index was improperly defined as the storage index type (e.g., int), * whereas it is now (starting from %Eigen 3.3) deprecated and always defined as Eigen::Index. 
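// Editorial sketch of the Map constructor documented in the SparseMap.h hunk
// above, assuming external CSC arrays whose inner indices are already sorted
// within each column:
//
//   int    outer[] = {0, 2, 3};        // cols + 1 entries
//   int    inner[] = {0, 2, 1};        // sorted within each column
//   double vals [] = {1.0, 2.0, 3.0};
//   Eigen::Map<Eigen::SparseMatrix<double>> m(3, 2, 3, outer, inner, vals);
//   // m.coeff(0,0) == 1.0, m.coeff(2,0) == 2.0, m.coeff(1,1) == 3.0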
@@ -43,11 +45,11 @@ namespace Eigen { */ namespace internal { -template -struct traits > +template +struct traits > { - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; typedef Sparse StorageKind; typedef MatrixXpr XprKind; enum { @@ -55,21 +57,21 @@ struct traits > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, + Flags = Options_ | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; -template -struct traits, DiagIndex> > +template +struct traits, DiagIndex> > { - typedef SparseMatrix<_Scalar, _Options, _StorageIndex> MatrixType; + typedef SparseMatrix MatrixType; typedef typename ref_selector::type MatrixTypeNested; - typedef typename remove_reference::type _MatrixTypeNested; + typedef std::remove_reference_t MatrixTypeNested_; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Dense StorageKind; - typedef _StorageIndex StorageIndex; + typedef StorageIndex_ StorageIndex; typedef MatrixXpr XprKind; enum { @@ -81,9 +83,9 @@ struct traits, DiagIndex }; }; -template -struct traits, DiagIndex> > - : public traits, DiagIndex> > +template +struct traits, DiagIndex> > + : public traits, DiagIndex> > { enum { Flags = 0 @@ -92,13 +94,13 @@ struct traits, Dia } // end namespace internal -template +template class SparseMatrix - : public SparseCompressedBase > + : public SparseCompressedBase > { typedef SparseCompressedBase Base; using Base::convert_index; - friend class SparseVector<_Scalar,0,_StorageIndex>; + friend class SparseVector; template friend struct internal::Assignment; public: @@ -108,7 +110,7 @@ class SparseMatrix using Base::operator+=; using Base::operator-=; - typedef MappedSparseMatrix Map; + typedef Eigen::Map> Map; typedef Diagonal DiagonalReturnType; typedef Diagonal ConstDiagonalReturnType; typedef typename Base::InnerIterator InnerIterator; @@ -118,13 +120,13 @@ class SparseMatrix using Base::IsRowMajor; typedef internal::CompressedStorage Storage; enum { - Options = _Options + Options = Options_ }; typedef typename Base::IndexVector IndexVector; typedef typename Base::ScalarVector ScalarVector; protected: - typedef SparseMatrix TransposedSparseMatrix; + typedef SparseMatrix TransposedSparseMatrix; Index m_outerSize; Index m_innerSize; @@ -253,9 +255,10 @@ class SparseMatrix inline void setZero() { m_data.clear(); - memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex)); - if(m_innerNonZeros) - memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); + std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0)); + if(m_innerNonZeros) { + std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0)); + } } /** Preallocates \a reserveSize non zeros. 
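// Editorial sketch of the per-inner-vector reserve() overload touched in the
// next hunk, assuming a column-major matrix and a hypothetical estimate of
// six nonzeros per column:
//
//   Eigen::SparseMatrix<double> A(rows, cols);
//   Eigen::VectorXi nnzPerCol = Eigen::VectorXi::Constant(cols, 6);
//   A.reserve(nnzPerCol);              // one reservation per inner vector
//   // ... A.insert(i, j) = v_ij; for each entry ...
//   A.makeCompressed();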
@@ -285,10 +288,7 @@ class SparseMatrix #else template inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = - #if (!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1500) // MSVC 2005 fails to compile with this typename - typename - #endif - SizesType::value_type()) + typename SizesType::value_type()) { EIGEN_UNUSED_VARIABLE(enableif); reserveInnerVectors(reserveSizes); @@ -302,8 +302,7 @@ class SparseMatrix { Index totalReserveSize = 0; // turn the matrix into non-compressed mode - m_innerNonZeros = static_cast(std::malloc(m_outerSize * sizeof(StorageIndex))); - if (!m_innerNonZeros) internal::throw_std_bad_alloc(); + m_innerNonZeros = internal::conditional_aligned_new_auto(m_outerSize); // temporarily use m_innerSizes to hold the new starting points. StorageIndex* newOuterIndex = m_innerNonZeros; @@ -336,8 +335,7 @@ class SparseMatrix } else { - StorageIndex* newOuterIndex = static_cast(std::malloc((m_outerSize+1)*sizeof(StorageIndex))); - if (!newOuterIndex) internal::throw_std_bad_alloc(); + StorageIndex* newOuterIndex = internal::conditional_aligned_new_auto(m_outerSize + 1); StorageIndex count = 0; for(Index j=0; j(newOuterIndex, m_outerSize + 1); } } @@ -488,7 +486,7 @@ class SparseMatrix m_outerIndex[j+1] = m_outerIndex[j] + m_innerNonZeros[j]; oldStart = nextOldStart; } - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); m_innerNonZeros = 0; m_data.resize(m_outerIndex[m_outerSize]); m_data.squeeze(); @@ -499,7 +497,7 @@ class SparseMatrix { if(m_innerNonZeros != 0) return; - m_innerNonZeros = static_cast(std::malloc(m_outerSize * sizeof(StorageIndex))); + m_innerNonZeros = internal::conditional_aligned_new_auto(m_outerSize); for (Index i = 0; i < m_outerSize; i++) { m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; @@ -569,9 +567,8 @@ class SparseMatrix if (m_innerNonZeros) { // Resize m_innerNonZeros - StorageIndex *newInnerNonZeros = static_cast(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(StorageIndex))); - if (!newInnerNonZeros) internal::throw_std_bad_alloc(); - m_innerNonZeros = newInnerNonZeros; + m_innerNonZeros = internal::conditional_aligned_realloc_new_auto( + m_innerNonZeros, m_outerSize + outerChange, m_outerSize); for(Index i=m_outerSize; i(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex))); - if (!m_innerNonZeros) internal::throw_std_bad_alloc(); + m_innerNonZeros = internal::conditional_aligned_new_auto(m_outerSize + outerChange); for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++) m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; for(Index i = m_outerSize; i < m_outerSize + outerChange; i++) @@ -604,9 +600,8 @@ class SparseMatrix if (outerChange == 0) return; - StorageIndex *newOuterIndex = static_cast(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(StorageIndex))); - if (!newOuterIndex) internal::throw_std_bad_alloc(); - m_outerIndex = newOuterIndex; + m_outerIndex = internal::conditional_aligned_realloc_new_auto( + m_outerIndex, m_outerSize + outerChange + 1, m_outerSize + 1); if (outerChange > 0) { StorageIndex lastIdx = m_outerSize == 0 ? 
0 : m_outerIndex[m_outerSize]; @@ -630,18 +625,16 @@ class SparseMatrix m_data.clear(); if (m_outerSize != outerSize || m_outerSize==0) { - std::free(m_outerIndex); - m_outerIndex = static_cast(std::malloc((outerSize + 1) * sizeof(StorageIndex))); - if (!m_outerIndex) internal::throw_std_bad_alloc(); - + m_outerIndex = internal::conditional_aligned_realloc_new_auto(m_outerIndex, outerSize + 1, + m_outerSize + 1); m_outerSize = outerSize; } if(m_innerNonZeros) { - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); m_innerNonZeros = 0; } - memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex)); + std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0)); } /** \internal @@ -664,7 +657,6 @@ class SparseMatrix inline SparseMatrix() : m_outerSize(-1), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); resize(0, 0); } @@ -672,7 +664,6 @@ class SparseMatrix inline SparseMatrix(Index rows, Index cols) : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); resize(rows, cols); } @@ -683,7 +674,6 @@ class SparseMatrix { EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - check_template_parameters(); const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator::Flags & RowMajorBit); if (needToTranspose) *this = other.derived(); @@ -695,21 +685,24 @@ class SparseMatrix internal::call_assignment_no_alias(*this, other.derived()); } } - + /** Constructs a sparse matrix from the sparse selfadjoint view \a other */ template inline SparseMatrix(const SparseSelfAdjointView& other) : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); Base::operator=(other); } + inline SparseMatrix(SparseMatrix&& other) : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) + { + *this = other.derived().markAsRValue(); + } + /** Copy constructor (it performs a deep copy) */ inline SparseMatrix(const SparseMatrix& other) : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); *this = other.derived(); } @@ -718,17 +711,15 @@ class SparseMatrix SparseMatrix(const ReturnByValue& other) : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); initAssignment(other); other.evalTo(*this); } - + /** \brief Copy constructor with in-place evaluation */ template explicit SparseMatrix(const DiagonalBase& other) : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { - check_template_parameters(); *this = other.derived(); } @@ -753,9 +744,10 @@ class SparseMatrix Eigen::Map(this->m_data.indexPtr(), rows()).setLinSpaced(0, StorageIndex(rows()-1)); Eigen::Map(this->m_data.valuePtr(), rows()).setOnes(); Eigen::Map(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows())); - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); m_innerNonZeros = 0; } + inline SparseMatrix& operator=(const SparseMatrix& other) { if (other.isRValue()) @@ -781,6 +773,10 @@ class SparseMatrix return *this; } + inline SparseMatrix& operator=(SparseMatrix&& other) { + return *this = other.derived().markAsRValue(); + } + #ifndef EIGEN_PARSED_BY_DOXYGEN template inline SparseMatrix& operator=(const EigenBase& other) @@ -793,6 +789,7 @@ class 
SparseMatrix template EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { EIGEN_DBG_SPARSE( @@ -837,12 +834,13 @@ class SparseMatrix s << static_cast&>(m); return s; } +#endif /** Destructor */ inline ~SparseMatrix() { - std::free(m_outerIndex); - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_outerIndex, m_outerSize + 1); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); } /** Overloaded for performance */ @@ -860,7 +858,7 @@ protected: resize(other.rows(), other.cols()); if(m_innerNonZeros) { - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); m_innerNonZeros = 0; } } @@ -1012,11 +1010,8 @@ protected: } private: - static void check_template_parameters() - { - EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); - EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS); - } + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) + EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS) struct default_prunning_func { default_prunning_func(const Scalar& ref, const RealScalar& eps) : reference(ref), epsilon(eps) {} @@ -1103,11 +1098,11 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather * be explicitly stored into a std::vector for instance. */ -template +template template -void SparseMatrix::setFromTriplets(const InputIterators& begin, const InputIterators& end) +void SparseMatrix::setFromTriplets(const InputIterators& begin, const InputIterators& end) { - internal::set_from_triplets >(begin, end, *this, internal::scalar_sum_op()); + internal::set_from_triplets >(begin, end, *this, internal::scalar_sum_op()); } /** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied: @@ -1119,17 +1114,17 @@ void SparseMatrix::setFromTriplets(const InputIte * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; }); * \endcode */ -template +template template -void SparseMatrix::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func) +void SparseMatrix::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func) { - internal::set_from_triplets, DupFunctor>(begin, end, *this, dup_func); + internal::set_from_triplets, DupFunctor>(begin, end, *this, dup_func); } /** \internal */ -template +template template -void SparseMatrix::collapseDuplicates(DupFunctor dup_func) +void SparseMatrix::collapseDuplicates(DupFunctor dup_func) { eigen_assert(!isCompressed()); // TODO, in practice we should be able to use m_innerNonZeros for that task @@ -1162,14 +1157,14 @@ void SparseMatrix::collapseDuplicates(DupFunctor m_outerIndex[m_outerSize] = count; // turn the matrix into compressed form - std::free(m_innerNonZeros); + internal::conditional_aligned_delete_auto(m_innerNonZeros, m_outerSize); m_innerNonZeros = 0; m_data.resize(m_outerIndex[m_outerSize]); } -template +template template -EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::operator=(const SparseMatrixBase& other) +EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::operator=(const SparseMatrixBase& other) { 
EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) @@ -1189,8 +1184,8 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::type >::type OtherCopy; - typedef typename internal::remove_all::type _OtherCopy; - typedef internal::evaluator<_OtherCopy> OtherCopyEval; + typedef internal::remove_all_t OtherCopy_; + typedef internal::evaluator OtherCopyEval; OtherCopy otherCopy(other.derived()); OtherCopyEval otherCopyEval(otherCopy); @@ -1240,8 +1235,8 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix -typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insert(Index row, Index col) +template +typename SparseMatrix::Scalar& SparseMatrix::insert(Index row, Index col) { eigen_assert(row>=0 && row=0 && col::Scalar& SparseMatrix<_Sca m_data.reserve(2*m_innerSize); // turn the matrix into non-compressed mode - m_innerNonZeros = static_cast(std::malloc(m_outerSize * sizeof(StorageIndex))); - if(!m_innerNonZeros) internal::throw_std_bad_alloc(); + m_innerNonZeros = internal::conditional_aligned_new_auto(m_outerSize); - memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); + std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0)); // pack all inner-vectors to the end of the pre-allocated space // and allocate the entire free-space to the first inner-vector @@ -1271,8 +1265,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca else { // turn the matrix into non-compressed mode - m_innerNonZeros = static_cast(std::malloc(m_outerSize * sizeof(StorageIndex))); - if(!m_innerNonZeros) internal::throw_std_bad_alloc(); + m_innerNonZeros = internal::conditional_aligned_new_auto(m_outerSize); for(Index j=0; j::Scalar& SparseMatrix<_Sca return insertUncompressed(row,col); } -template -EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertUncompressed(Index row, Index col) +template +EIGEN_DONT_INLINE typename SparseMatrix::Scalar& SparseMatrix::insertUncompressed(Index row, Index col) { eigen_assert(!isCompressed()); @@ -1392,8 +1385,8 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& return (m_data.value(p) = Scalar(0)); } -template -EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertCompressed(Index row, Index col) +template +EIGEN_DONT_INLINE typename SparseMatrix::Scalar& SparseMatrix::insertCompressed(Index row, Index col) { eigen_assert(isCompressed()); @@ -1501,18 +1494,138 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& namespace internal { -template -struct evaluator > - : evaluator > > +template +struct evaluator > + : evaluator > > { - typedef evaluator > > Base; - typedef SparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType; + typedef evaluator > > Base; + typedef SparseMatrix SparseMatrixType; evaluator() : Base() {} explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } +// Specialization for SparseMatrix. +// Serializes [rows, cols, isCompressed, outerSize, innerBufferSize, +// innerNonZeros, outerIndices, innerIndices, values]. 
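// Editorial sketch of a round trip through the serializer defined below,
// assuming its size()/serialize()/deserialize() interface:
//
//   Eigen::SparseMatrix<double> A = ...;            // any sparse matrix
//   Eigen::Serializer<Eigen::SparseMatrix<double>> s;
//   std::vector<uint8_t> buf(s.size(A));
//   uint8_t* end = s.serialize(buf.data(), buf.data() + buf.size(), A);
//   Eigen::SparseMatrix<double> B;
//   s.deserialize(buf.data(), end, B);              // B now matches A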
+template +class Serializer, void> { + public: + typedef SparseMatrix SparseMat; + + struct Header { + typename SparseMat::Index rows; + typename SparseMat::Index cols; + bool compressed; + Index outer_size; + Index inner_buffer_size; + }; + + EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const { + // innerNonZeros. + std::size_t num_storage_indices = value.isCompressed() ? 0 : value.outerSize(); + // Outer indices. + num_storage_indices += value.outerSize() + 1; + // Inner indices. + const StorageIndex inner_buffer_size = value.outerIndexPtr()[value.outerSize()]; + num_storage_indices += inner_buffer_size; + // Values. + std::size_t num_values = inner_buffer_size; + return sizeof(Header) + sizeof(Scalar) * num_values + + sizeof(StorageIndex) * num_storage_indices; + } + + EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, + const SparseMat& value) { + if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr; + + const size_t header_bytes = sizeof(Header); + Header header = {value.rows(), value.cols(), value.isCompressed(), + value.outerSize(), value.outerIndexPtr()[value.outerSize()]}; + EIGEN_USING_STD(memcpy) + memcpy(dest, &header, header_bytes); + dest += header_bytes; + + // innerNonZeros. + if (!header.compressed) { + std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size; + memcpy(dest, value.innerNonZeroPtr(), data_bytes); + dest += data_bytes; + } + + // Outer indices. + std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1); + memcpy(dest, value.outerIndexPtr(), data_bytes); + dest += data_bytes; + + // Inner indices. + data_bytes = sizeof(StorageIndex) * header.inner_buffer_size; + memcpy(dest, value.innerIndexPtr(), data_bytes); + dest += data_bytes; + + // Values. + data_bytes = sizeof(Scalar) * header.inner_buffer_size; + memcpy(dest, value.valuePtr(), data_bytes); + dest += data_bytes; + + return dest; + } + + EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, + const uint8_t* end, + SparseMat& value) const { + if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr; + + const size_t header_bytes = sizeof(Header); + Header header; + EIGEN_USING_STD(memcpy) + memcpy(&header, src, header_bytes); + src += header_bytes; + + value.setZero(); + value.resize(header.rows, header.cols); + if (header.compressed) { + value.makeCompressed(); + } else { + value.uncompress(); + } + + // Adjust value ptr size. + value.data().resize(header.inner_buffer_size); + + // Initialize compressed state and inner non-zeros. + if (!header.compressed) { + // Inner non-zero counts. + std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.innerNonZeroPtr(), src, data_bytes); + src += data_bytes; + } + + // Outer indices. + std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1); + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.outerIndexPtr(), src, data_bytes); + src += data_bytes; + + // Inner indices. + data_bytes = sizeof(StorageIndex) * header.inner_buffer_size; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.innerIndexPtr(), src, data_bytes); + src += data_bytes; + + // Values. 
+ data_bytes = sizeof(Scalar) * header.inner_buffer_size; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.valuePtr(), src, data_bytes); + src += data_bytes; + return src; + } +}; + } // end namespace Eigen #endif // EIGEN_SPARSEMATRIX_H diff --git a/libs/eigen/Eigen/src/SparseCore/SparseMatrixBase.h b/libs/eigen/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f..dc78c2e 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseMatrixBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEMATRIXBASE_H #define EIGEN_SPARSEMATRIXBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup SparseCore_Module @@ -69,8 +71,7 @@ template class SparseMatrixBase * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */ - SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, - internal::traits::ColsAtCompileTime>::ret), + SizeAtCompileTime = (internal::size_of_xpr_at_compile_time::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */ @@ -78,8 +79,7 @@ template class SparseMatrixBase MaxRowsAtCompileTime = RowsAtCompileTime, MaxColsAtCompileTime = ColsAtCompileTime, - MaxSizeAtCompileTime = (internal::size_at_compile_time::ret), + MaxSizeAtCompileTime = internal::size_at_compile_time(MaxRowsAtCompileTime, MaxColsAtCompileTime), IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of @@ -103,17 +103,17 @@ template class SparseMatrixBase : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), #ifndef EIGEN_PARSED_BY_DOXYGEN - _HasDirectAccess = (int(Flags)&DirectAccessBit) ? 1 : 0 // workaround sunCC + HasDirectAccess_ = (int(Flags)&DirectAccessBit) ? 
1 : 0 // workaround sunCC #endif }; /** \internal the return type of MatrixBase::adjoint() */ - typedef typename internal::conditional::IsComplex, + typedef std::conditional_t::IsComplex, CwiseUnaryOp, Eigen::Transpose >, Transpose - >::type AdjointReturnType; + > AdjointReturnType; typedef Transpose TransposeReturnType; - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order typedef SparseMatrix PlainObject; @@ -129,7 +129,7 @@ template class SparseMatrixBase /** \internal the return type of coeff() */ - typedef typename internal::conditional<_HasDirectAccess, const Scalar&, Scalar>::type CoeffReturnType; + typedef std::conditional_t CoeffReturnType; /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,Matrix > ConstantReturnType; @@ -137,8 +137,8 @@ template class SparseMatrixBase /** type of the equivalent dense matrix */ typedef Matrix DenseMatrixType; /** type of the equivalent square matrix */ - typedef Matrix SquareMatrixType; + typedef Matrix SquareMatrixType; inline const Derived& derived() const { return *static_cast(this); } inline Derived& derived() { return *static_cast(this); } @@ -214,11 +214,11 @@ template class SparseMatrixBase inline void assignGeneric(const OtherDerived& other); public: - +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m) { typedef typename Derived::Nested Nested; - typedef typename internal::remove_all::type NestedCleaned; + typedef internal::remove_all_t NestedCleaned; if (Flags&RowMajorBit) { @@ -263,6 +263,7 @@ template class SparseMatrixBase } return s; } +#endif template Derived& operator+=(const SparseMatrixBase& other); diff --git a/libs/eigen/Eigen/src/SparseCore/SparsePermutation.h b/libs/eigen/Eigen/src/SparseCore/SparsePermutation.h index ef38357..af9a1fe 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparsePermutation.h +++ b/libs/eigen/Eigen/src/SparseCore/SparsePermutation.h @@ -12,6 +12,8 @@ // This file implements sparse * permutation products +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -20,7 +22,7 @@ template struct permutation_matrix_product { typedef typename nested_eval::type MatrixType; - typedef typename remove_all::type MatrixTypeCleaned; + typedef remove_all_t MatrixTypeCleaned; typedef typename MatrixTypeCleaned::Scalar Scalar; typedef typename MatrixTypeCleaned::StorageIndex StorageIndex; @@ -30,9 +32,9 @@ struct permutation_matrix_product MoveOuter = SrcStorageOrder==RowMajor ? 
Side==OnTheLeft : Side==OnTheRight }; - typedef typename internal::conditional, - SparseMatrix >::type ReturnType; + SparseMatrix > ReturnType; template static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) @@ -107,7 +109,7 @@ struct product_evaluator, ProductTag, Permut explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } diff --git a/libs/eigen/Eigen/src/SparseCore/SparseProduct.h b/libs/eigen/Eigen/src/SparseCore/SparseProduct.h index af8a774..85a8a10 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseProduct.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseProduct.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEPRODUCT_H #define EIGEN_SPARSEPRODUCT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \returns an expression of the product of two sparse matrices. @@ -45,19 +47,19 @@ struct generic_product_impl // dense += sparse * sparse template - static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, typename enable_if::Shape,DenseShape>::value,int*>::type* = 0) + static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, std::enable_if_t::Shape,DenseShape>::value,int*>* = 0) { typedef typename nested_eval::type LhsNested; typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhs); RhsNested rhsNested(rhs); - internal::sparse_sparse_to_dense_product_selector::type, - typename remove_all::type, Dest>::run(lhsNested,rhsNested,dst); + internal::sparse_sparse_to_dense_product_selector, + remove_all_t, Dest>::run(lhsNested,rhsNested,dst); } // dense -= sparse * sparse template - static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, typename enable_if::Shape,DenseShape>::value,int*>::type* = 0) + static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, std::enable_if_t::Shape,DenseShape>::value,int*>* = 0) { addTo(dst, -lhs, rhs); } @@ -72,8 +74,8 @@ protected: typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhs); RhsNested rhsNested(rhs); - internal::conservative_sparse_sparse_product_selector::type, - typename remove_all::type, Dest>::run(lhsNested,rhsNested,dst); + internal::conservative_sparse_sparse_product_selector, + remove_all_t, Dest>::run(lhsNested,rhsNested,dst); } // dense = sparse * sparse @@ -147,14 +149,14 @@ struct unary_evaluator >, IteratorBased> : m_result(xpr.rows(), xpr.cols()) { using std::abs; - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); typedef typename nested_eval::type LhsNested; typedef typename nested_eval::type RhsNested; LhsNested lhsNested(xpr.nestedExpression().lhs()); RhsNested rhsNested(xpr.nestedExpression().rhs()); - internal::sparse_sparse_product_with_pruning_selector::type, - typename remove_all::type, PlainObject>::run(lhsNested,rhsNested,m_result, + internal::sparse_sparse_product_with_pruning_selector, + remove_all_t, PlainObject>::run(lhsNested,rhsNested,m_result, abs(xpr.reference())*xpr.epsilon()); } @@ -165,9 +167,9 @@ protected: } // end namespace internal // sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.) 
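// Editorial sketch: the operator= overload below lets product expressions be
// assigned directly; the result is evaluated into a temporary first, which
// also keeps aliased assignments well defined:
//
//   Eigen::SparseMatrix<double> A, B, C;   // A and B filled elsewhere
//   C = A * B;                             // conservative sparse*sparse product
//   A = A * B;                             // safe: goes through the temporary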
-template +template template -SparseMatrix& SparseMatrix::operator=(const Product& src) +SparseMatrix& SparseMatrix::operator=(const Product& src) { // std::cout << "in Assignment : " << DstOptions << "\n"; SparseMatrix dst(src.rows(),src.cols()); diff --git a/libs/eigen/Eigen/src/SparseCore/SparseRedux.h b/libs/eigen/Eigen/src/SparseCore/SparseRedux.h index 4587749..6b14c58 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseRedux.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseRedux.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEREDUX_H #define EIGEN_SPARSEREDUX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template @@ -25,9 +27,9 @@ SparseMatrixBase::sum() const return res; } -template -typename internal::traits >::Scalar -SparseMatrix<_Scalar,_Options,_Index>::sum() const +template +typename internal::traits >::Scalar +SparseMatrix::sum() const { eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix"); if(this->isCompressed()) @@ -36,9 +38,9 @@ SparseMatrix<_Scalar,_Options,_Index>::sum() const return Base::sum(); } -template -typename internal::traits >::Scalar -SparseVector<_Scalar,_Options,_Index>::sum() const +template +typename internal::traits >::Scalar +SparseVector::sum() const { eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix"); return Matrix::Map(m_data.valuePtr(), m_data.size()).sum(); diff --git a/libs/eigen/Eigen/src/SparseCore/SparseRef.h b/libs/eigen/Eigen/src/SparseCore/SparseRef.h index 748f87d..9e69d93 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseRef.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseRef.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_REF_H #define EIGEN_SPARSE_REF_H +#include "./InternalHeaderCheck.h" + namespace Eigen { enum { @@ -20,13 +22,13 @@ namespace internal { template class SparseRefBase; -template -struct traits, _Options, _StrideType> > +template +struct traits, Options_, StrideType_> > : public traits > { typedef SparseMatrix PlainObjectType; enum { - Options = _Options, + Options = Options_, Flags = traits::Flags | CompressedAccessBit | NestByRefBit }; @@ -35,27 +37,27 @@ struct traits, _Options, _Stride StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch }; - typedef typename internal::conditional::type type; + typedef std::conditional_t type; }; }; -template -struct traits, _Options, _StrideType> > - : public traits, _Options, _StrideType> > +template +struct traits, Options_, StrideType_> > + : public traits, Options_, StrideType_> > { enum { Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit }; }; -template -struct traits, _Options, _StrideType> > +template +struct traits, Options_, StrideType_> > : public traits > { typedef SparseVector PlainObjectType; enum { - Options = _Options, + Options = Options_, Flags = traits::Flags | CompressedAccessBit | NestByRefBit }; @@ -63,14 +65,14 @@ struct traits, _Options, _Stride enum { MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && Derived::IsVectorAtCompileTime }; - typedef typename internal::conditional::type type; + typedef std::conditional_t type; }; }; -template -struct traits, _Options, _StrideType> > - : public traits, _Options, _StrideType> > +template +struct traits, Options_, StrideType_> > + : public traits, Options_, StrideType_> > { enum { Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & 
~LvalueBit @@ -98,9 +100,9 @@ protected: void construct(Expression& expr) { if(expr.outerIndexPtr()==0) - ::new (static_cast(this)) Base(expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr()); + internal::construct_at(this, expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr()); else - ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); + internal::construct_at(this, expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); } }; @@ -133,7 +135,7 @@ class Ref template inline Ref(const SparseMatrix& expr); template - inline Ref(const MappedSparseMatrix& expr); + inline Ref(const Map>& expr); public: typedef internal::SparseRefBase Base; @@ -148,15 +150,15 @@ class Ref eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) ); Base::construct(expr.derived()); } - + template - inline Ref(MappedSparseMatrix& expr) + inline Ref(Map >& expr) { EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) ); Base::construct(expr.derived()); } - + template inline Ref(const SparseCompressedBase& expr) #else @@ -201,8 +203,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); - obj->~TPlainObjectType(); + internal::destroy_at(reinterpret_cast(&m_storage)); } } @@ -213,8 +214,7 @@ class Ref, Options, StrideType { if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed())) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); - ::new (obj) TPlainObjectType(expr); + TPlainObjectType* obj = internal::construct_at(reinterpret_cast(&m_storage), expr); m_hasCopy = true; Base::construct(*obj); } @@ -227,8 +227,7 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); - ::new (obj) TPlainObjectType(expr); + TPlainObjectType* obj = internal::construct_at(reinterpret_cast(&m_storage), expr); m_hasCopy = true; Base::construct(*obj); } @@ -319,8 +318,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); - obj->~TPlainObjectType(); + internal::destroy_at(reinterpret_cast(&m_storage)); } } @@ -335,8 +333,7 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); - ::new (obj) TPlainObjectType(expr); + TPlainObjectType* obj = internal::construct_at(reinterpret_cast(&m_storage), expr); m_hasCopy = true; Base::construct(*obj); } @@ -355,7 +352,7 @@ struct evaluator, Options, Strid : evaluator, Options, StrideType> > > { typedef evaluator, Options, StrideType> > > Base; - typedef Ref, Options, StrideType> XprType; + typedef Ref, Options, StrideType> XprType; evaluator() : Base() {} explicit evaluator(const XprType &mat) : Base(mat) {} }; @@ -365,7 +362,7 @@ struct evaluator, Options, : evaluator, Options, StrideType> > > { typedef evaluator, Options, StrideType> > > Base; - typedef Ref, Options, StrideType> XprType; + typedef Ref, Options, StrideType> XprType; evaluator() : Base() {} explicit evaluator(const XprType &mat) : Base(mat) {} }; diff --git a/libs/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h 
b/libs/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h index 85b00e1..211506e 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSE_SELFADJOINTVIEW_H #define EIGEN_SPARSE_SELFADJOINTVIEW_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup SparseCore_Module @@ -40,13 +42,13 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix class SparseSelfAdjointView - : public EigenBase > +template class SparseSelfAdjointView + : public EigenBase > { public: enum { - Mode = _Mode, + Mode = Mode_, TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0), RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime @@ -57,7 +59,7 @@ template class SparseSelfAdjointView typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix VectorI; typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef internal::remove_all_t MatrixTypeNested_; explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix) { @@ -68,8 +70,8 @@ template class SparseSelfAdjointView inline Index cols() const { return m_matrix.cols(); } /** \internal \returns a reference to the nested matrix */ - const _MatrixTypeNested& matrix() const { return m_matrix; } - typename internal::remove_reference::type& matrix() { return m_matrix; } + const MatrixTypeNested_& matrix() const { return m_matrix; } + std::remove_reference_t& matrix() { return m_matrix; } /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs. 
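// ---- editor's note (not part of the patch) ----------------------------------
// The renames in this hunk (_Mode -> Mode_, _MatrixTypeNested -> MatrixTypeNested_)
// follow the patch-wide convention of moving the leading underscore to the end.
// Identifiers beginning with an underscore followed by an uppercase letter are
// reserved for the implementation in C++ ([lex.name]), so the old spellings were
// formally off-limits; trailing underscores are not. Illustrative shape only:
template <typename Scalar_, int Mode_>  // OK: ordinary user-land identifiers
struct ExampleView {
  typedef Scalar_ Scalar;               // public alias keeps the familiar name
  static const int Mode = Mode_;
};
// template <typename _Scalar, int _Mode>  // reserved spellings, now avoided
// ------------------------------------------------------------------------------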
* @@ -124,9 +126,9 @@ template class SparseSelfAdjointView /** \returns an expression of P H P^-1 */ // TODO implement twists in a more evaluator friendly fashion - SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix& perm) const + SparseSymmetricPermutationProduct twistedBy(const PermutationMatrix& perm) const { - return SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode>(m_matrix, perm); + return SparseSymmetricPermutationProduct(m_matrix, perm); } template @@ -260,15 +262,6 @@ struct Assignment run(tmp, src, AssignOpType()); dst -= tmp; } - - template - static void run(DynamicSparseMatrix& dst, const SrcXprType &src, const AssignOpType&/*func*/) - { - // TODO directly evaluate into dst; - SparseMatrix tmp(dst.rows(),dst.cols()); - internal::permute_symm_to_fullsymm(src.matrix(), tmp); - dst = tmp; - } }; } // end namespace internal @@ -285,7 +278,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons EIGEN_ONLY_USED_FOR_DEBUG(alpha); typedef typename internal::nested_eval::type SparseLhsTypeNested; - typedef typename internal::remove_all::type SparseLhsTypeNestedCleaned; + typedef internal::remove_all_t SparseLhsTypeNestedCleaned; typedef evaluator LhsEval; typedef typename LhsEval::InnerIterator LhsIterator; typedef typename SparseLhsType::Scalar LhsScalar; @@ -347,7 +340,7 @@ struct generic_product_impl static void scaleAndAddTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs, const typename Dest::Scalar& alpha) { - typedef typename LhsView::_MatrixTypeNested Lhs; + typedef typename LhsView::MatrixTypeNested_ Lhs; typedef typename nested_eval::type LhsNested; typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhsView.matrix()); @@ -364,7 +357,7 @@ struct generic_product_impl static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView, const typename Dest::Scalar& alpha) { - typedef typename RhsView::_MatrixTypeNested Rhs; + typedef typename RhsView::MatrixTypeNested_ Rhs; typedef typename nested_eval::type LhsNested; typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhs); @@ -390,7 +383,7 @@ struct product_evaluator, ProductTag, Spar product_evaluator(const XprType& xpr) : m_lhs(xpr.lhs()), m_result(xpr.rows(), xpr.cols()) { - ::new (static_cast(this)) Base(m_result); + internal::construct_at(this, m_result); generic_product_impl::evalTo(m_result, m_lhs, xpr.rhs()); } @@ -516,7 +509,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix +template void permute_symm_to_symm(const MatrixType& mat, SparseMatrix& _dest, const typename MatrixType::StorageIndex* perm) { typedef typename MatrixType::StorageIndex StorageIndex; @@ -529,8 +522,8 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix VectorI; typedef typename MatrixType::Nested MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm) : m_matrix(mat), m_perm(perm) diff --git a/libs/eigen/Eigen/src/SparseCore/SparseSolverBase.h b/libs/eigen/Eigen/src/SparseCore/SparseSolverBase.h index b4c9a42..8261fb5 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseSolverBase.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseSolverBase.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSESOLVERBASE_H #define EIGEN_SPARSESOLVERBASE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -19,7 +21,7 @@ namespace internal { * The rhs is 
decomposed into small vertical panels which are solved through dense temporaries. */ template -typename enable_if::type +std::enable_if_t solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest) { EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); @@ -43,7 +45,7 @@ solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest // Overload for vector as rhs template -typename enable_if::type +std::enable_if_t solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest) { typedef typename Dest::Scalar DestScalar; @@ -73,6 +75,8 @@ class SparseSolverBase : internal::noncopyable : m_isInitialized(false) {} + SparseSolverBase(SparseSolverBase&&other ) : internal::noncopyable{}, m_isInitialized{other.m_isInitialized} {} + ~SparseSolverBase() {} diff --git a/libs/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/libs/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 88820a4..ee0ec1b 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H #define EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -21,9 +23,9 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res); - typedef typename remove_all::type::Scalar RhsScalar; - typedef typename remove_all::type::Scalar ResScalar; - typedef typename remove_all::type::StorageIndex StorageIndex; + typedef typename remove_all_t::Scalar RhsScalar; + typedef typename remove_all_t::Scalar ResScalar; + typedef typename remove_all_t::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. 
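// ---- editor's sketch (not part of the patch) --------------------------------
// The two solve_sparse_through_dense_panels overloads above are paired through
// std::enable_if_t so that exactly one survives template substitution for a given
// right-hand side (a multi-column block vs. a single vector). The stripped
// template arguments make the exact predicate unrecoverable here, so this reduced
// sketch assumes it keys on ColsAtCompileTime (C++14):
#include <type_traits>

template <class Dest>
std::enable_if_t<Dest::ColsAtCompileTime != 1>  // enabled for block right-hand sides
solve_dispatch(Dest&) { /* solve panel by panel through dense temporaries */ }

template <class Dest>
std::enable_if_t<Dest::ColsAtCompileTime == 1>  // enabled for a single column
solve_dispatch(Dest&) { /* one dense temporary suffices */ }

struct OneCol  { static const int ColsAtCompileTime = 1; };
struct ManyCol { static const int ColsAtCompileTime = 4; };
// solve_dispatch picks the vector overload for OneCol and the panel one for ManyCol.
// ------------------------------------------------------------------------------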
Index rows = lhs.innerSize(); @@ -90,7 +92,7 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); + remove_all_t _res(res.rows(), res.cols()); internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); res.swap(_res); } @@ -117,7 +119,7 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); + remove_all_t _res(res.rows(), res.cols()); internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, _res, tolerance); res.swap(_res); } diff --git a/libs/eigen/Eigen/src/SparseCore/SparseTranspose.h b/libs/eigen/Eigen/src/SparseCore/SparseTranspose.h index 3757d4c..cce5903 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseTranspose.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseTranspose.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSETRANSPOSE_H #define EIGEN_SPARSETRANSPOSE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseCore/SparseTriangularView.h b/libs/eigen/Eigen/src/SparseCore/SparseTriangularView.h index 9ac1202..5e7cea7 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseTriangularView.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseTriangularView.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SPARSE_TRIANGULARVIEW_H #define EIGEN_SPARSE_TRIANGULARVIEW_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup SparseCore_Module @@ -44,8 +46,8 @@ template class TriangularViewImpl::type MatrixTypeNestedNonRef; - typedef typename internal::remove_all::type MatrixTypeNestedCleaned; + typedef std::remove_reference_t MatrixTypeNestedNonRef; + typedef internal::remove_all_t MatrixTypeNestedCleaned; template EIGEN_DEVICE_FUNC diff --git a/libs/eigen/Eigen/src/SparseCore/SparseUtil.h b/libs/eigen/Eigen/src/SparseCore/SparseUtil.h index ceb9368..47f5ef6 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseUtil.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseUtil.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEUTIL_H #define EIGEN_SPARSEUTIL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { #ifdef NDEBUG @@ -49,10 +51,8 @@ const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; const int OuterRandomAccessPattern = 0x4 | CoherentAccessPattern; const int RandomAccessPattern = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern; -template class SparseMatrix; -template class DynamicSparseMatrix; -template class SparseVector; -template class MappedSparseMatrix; +template class SparseMatrix; +template class SparseVector; template class SparseSelfAdjointView; template class SparseDiagonalProduct; @@ -65,10 +65,10 @@ template class SparseDenseOuterProdu template struct SparseSparseProductReturnType; template::ColsAtCompileTime,internal::traits::RowsAtCompileTime)> struct DenseSparseProductReturnType; + int InnerSize = internal::min_size_prefer_fixed(internal::traits::ColsAtCompileTime, internal::traits::RowsAtCompileTime)> struct DenseSparseProductReturnType; template::ColsAtCompileTime,internal::traits::RowsAtCompileTime)> struct SparseDenseProductReturnType; + int InnerSize = internal::min_size_prefer_fixed(internal::traits::ColsAtCompileTime, internal::traits::RowsAtCompileTime)> struct SparseDenseProductReturnType; template class SparseSymmetricPermutationProduct; namespace internal { @@ -80,41 +80,41 @@ template struct eval {}; template struct sparse_eval { - typedef typename traits::Scalar _Scalar; - typedef typename traits::StorageIndex _StorageIndex; + typedef typename traits::Scalar Scalar_; + typedef typename traits::StorageIndex 
StorageIndex_; public: - typedef SparseVector<_Scalar, RowMajor, _StorageIndex> type; + typedef SparseVector type; }; template struct sparse_eval { - typedef typename traits::Scalar _Scalar; - typedef typename traits::StorageIndex _StorageIndex; + typedef typename traits::Scalar Scalar_; + typedef typename traits::StorageIndex StorageIndex_; public: - typedef SparseVector<_Scalar, ColMajor, _StorageIndex> type; + typedef SparseVector type; }; // TODO this seems almost identical to plain_matrix_type template struct sparse_eval { - typedef typename traits::Scalar _Scalar; - typedef typename traits::StorageIndex _StorageIndex; - enum { _Options = ((Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; + typedef typename traits::Scalar Scalar_; + typedef typename traits::StorageIndex StorageIndex_; + enum { Options_ = ((Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; public: - typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type; + typedef SparseMatrix type; }; template struct sparse_eval { - typedef typename traits::Scalar _Scalar; + typedef typename traits::Scalar Scalar_; public: - typedef Matrix<_Scalar, 1, 1> type; + typedef Matrix type; }; template struct plain_matrix_type { - typedef typename traits::Scalar _Scalar; - typedef typename traits::StorageIndex _StorageIndex; - enum { _Options = ((evaluator::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; + typedef typename traits::Scalar Scalar_; + typedef typename traits::StorageIndex StorageIndex_; + enum { Options_ = ((evaluator::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; public: - typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type; + typedef SparseMatrix type; }; template diff --git a/libs/eigen/Eigen/src/SparseCore/SparseVector.h b/libs/eigen/Eigen/src/SparseCore/SparseVector.h index 05779be..3b4d7b0 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseVector.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseVector.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSEVECTOR_H #define EIGEN_SPARSEVECTOR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { /** \ingroup SparseCore_Module @@ -17,7 +19,7 @@ namespace Eigen { * * \brief a sparse vector class * - * \tparam _Scalar the scalar type, i.e. the type of the coefficients + * \tparam Scalar_ the scalar type, i.e. the type of the coefficients * * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme. * @@ -26,21 +28,21 @@ namespace Eigen { */ namespace internal { -template -struct traits > +template +struct traits > { - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; typedef Sparse StorageKind; typedef MatrixXpr XprKind; enum { - IsColVector = (_Options & RowMajorBit) ? 0 : 1, + IsColVector = (Options_ & RowMajorBit) ? 0 : 1, RowsAtCompileTime = IsColVector ? Dynamic : 1, ColsAtCompileTime = IsColVector ? 1 : Dynamic, MaxRowsAtCompileTime = RowsAtCompileTime, MaxColsAtCompileTime = ColsAtCompileTime, - Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit, + Flags = Options_ | NestByRefBit | LvalueBit | (IsColVector ? 
0 : RowMajorBit) | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -60,9 +62,9 @@ struct sparse_vector_assign_selector; } -template +template class SparseVector - : public SparseCompressedBase > + : public SparseCompressedBase > { typedef SparseCompressedBase Base; using Base::convert_index; @@ -75,7 +77,7 @@ class SparseVector enum { IsColVector = internal::traits::IsColVector }; enum { - Options = _Options + Options = Options_ }; EIGEN_STRONG_INLINE Index rows() const { return IsColVector ? m_size : 1; } @@ -207,9 +209,33 @@ class SparseVector inline void finalize() {} /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */ - void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) + Index prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { + return prune([&](const Scalar& val){ return !internal::isMuchSmallerThan(val, reference, epsilon); }); + } + + /** + * \brief Prunes the entries of the vector based on a `predicate` + * \tparam F Type of the predicate. + * \param keep_predicate The predicate that is used to test whether a value should be kept. A callable that + * gets passed a `Scalar` value and returns a boolean. If the predicate returns true, the value is kept. + * \return The new number of structural non-zeros. + */ + template + Index prune(F&& keep_predicate) { - m_data.prune(reference,epsilon); + Index k = 0; + Index n = m_data.size(); + for (Index i = 0; i < n; ++i) + { + if (keep_predicate(m_data.value(i))) + { + m_data.value(k) = std::move(m_data.value(i)); + m_data.index(k) = m_data.index(i); + ++k; + } + } + m_data.resize(k); + return k; } /** Resizes the sparse vector to \a rows x \a cols @@ -256,11 +282,11 @@ class SparseVector void resizeNonZeros(Index size) { m_data.resize(size); } - inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); } + inline SparseVector() : m_size(0) { resize(0); } - explicit inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); } + explicit inline SparseVector(Index size) : m_size(0) { resize(size); } - inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); } + inline SparseVector(Index rows, Index cols) : m_size(0) { resize(rows,cols); } template inline SparseVector(const SparseMatrixBase& other) @@ -269,14 +295,12 @@ class SparseVector #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN #endif - check_template_parameters(); *this = other.derived(); } inline SparseVector(const SparseVector& other) : Base(other), m_size(0) { - check_template_parameters(); *this = other.derived(); } @@ -329,6 +353,7 @@ class SparseVector } #endif +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseVector& m) { for (Index i=0; i::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); - EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS); - } - + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) + EIGEN_STATIC_ASSERT((Options_&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS) + Storage m_data; Index m_size; }; namespace internal { -template -struct evaluator > - : evaluator_base > +template +struct evaluator > + : evaluator_base > { - typedef SparseVector<_Scalar,_Options,_Index> SparseVectorType; + typedef SparseVector SparseVectorType; typedef evaluator_base Base; typedef typename
SparseVectorType::InnerIterator InnerIterator; typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator; enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, + CoeffReadCost = NumTraits::ReadCost, Flags = SparseVectorType::Flags }; @@ -473,6 +495,78 @@ struct sparse_vector_assign_selector { } +// Specialization for SparseVector. +// Serializes [size, numNonZeros, innerIndices, values]. +template +class Serializer, void> { + public: + typedef SparseVector SparseMat; + + struct Header { + typename SparseMat::Index size; + Index num_non_zeros; + }; + + EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const { + return sizeof(Header) + + (sizeof(Scalar) + sizeof(StorageIndex)) * value.nonZeros(); + } + + EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, + const SparseMat& value) { + if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr; + + const size_t header_bytes = sizeof(Header); + Header header = {value.innerSize(), value.nonZeros()}; + EIGEN_USING_STD(memcpy) + memcpy(dest, &header, header_bytes); + dest += header_bytes; + + // Inner indices. + std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros; + memcpy(dest, value.innerIndexPtr(), data_bytes); + dest += data_bytes; + + // Values. + data_bytes = sizeof(Scalar) * header.num_non_zeros; + memcpy(dest, value.valuePtr(), data_bytes); + dest += data_bytes; + + return dest; + } + + EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, + const uint8_t* end, + SparseMat& value) const { + if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr; + if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr; + + const size_t header_bytes = sizeof(Header); + Header header; + EIGEN_USING_STD(memcpy) + memcpy(&header, src, header_bytes); + src += header_bytes; + + value.setZero(); + value.resize(header.size); + value.resizeNonZeros(header.num_non_zeros); + + // Inner indices. + std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.innerIndexPtr(), src, data_bytes); + src += data_bytes; + + // Values. 
+ data_bytes = sizeof(Scalar) * header.num_non_zeros; + if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr; + memcpy(value.valuePtr(), src, data_bytes); + src += data_bytes; + return src; + } +}; + } // end namespace Eigen #endif // EIGEN_SPARSEVECTOR_H diff --git a/libs/eigen/Eigen/src/SparseCore/SparseView.h b/libs/eigen/Eigen/src/SparseCore/SparseView.h index 92b3d1f..dbb4c43 100644 --- a/libs/eigen/Eigen/src/SparseCore/SparseView.h +++ b/libs/eigen/Eigen/src/SparseCore/SparseView.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SPARSEVIEW_H #define EIGEN_SPARSEVIEW_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -45,11 +47,11 @@ template class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef internal::remove_all_t MatrixTypeNested_; typedef SparseMatrixBase Base; public: EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView) - typedef typename internal::remove_all::type NestedExpression; + typedef internal::remove_all_t NestedExpression; explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0), const RealScalar &epsilon = NumTraits::dummy_precision()) @@ -62,7 +64,7 @@ public: inline Index outerSize() const { return m_matrix.outerSize(); } /** \returns the nested expression */ - const typename internal::remove_all::type& + const internal::remove_all_t& nestedExpression() const { return m_matrix; } Scalar reference() const { return m_reference; } diff --git a/libs/eigen/Eigen/src/SparseCore/TriangularSolver.h b/libs/eigen/Eigen/src/SparseCore/TriangularSolver.h index f9c56ba..a9fbeeb 100644 --- a/libs/eigen/Eigen/src/SparseCore/TriangularSolver.h +++ b/libs/eigen/Eigen/src/SparseCore/TriangularSolver.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSETRIANGULARSOLVER_H #define EIGEN_SPARSETRIANGULARSOLVER_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -114,7 +116,7 @@ struct sparse_solve_triangular_selector for(Index i=0; i for(Index i=lhs.cols()-1; i>=0; --i) { Scalar& tmp = other.coeffRef(i,col); - if (tmp!=Scalar(0)) // optimization when other is actually sparse + if (!numext::is_exactly_zero(tmp)) // optimization when other is actually sparse { if(!(Mode & UnitDiag)) { @@ -182,11 +184,11 @@ void TriangularViewImpl::solveInPlace(MatrixBase::Flags & RowMajorBit }; - typedef typename internal::conditional::type, OtherDerived&>::type OtherCopy; + typedef std::conditional_t::type, OtherDerived&> OtherCopy; OtherCopy otherCopy(other.derived()); - internal::sparse_solve_triangular_selector::type, Mode>::run(derived().nestedExpression(), otherCopy); + internal::sparse_solve_triangular_selector, Mode>::run(derived().nestedExpression(), otherCopy); if (copy) other = otherCopy; @@ -239,7 +241,7 @@ struct sparse_solve_triangular_sparse_selector { tempVector.restart(); Scalar& ci = tempVector.coeffRef(i); - if (ci!=Scalar(0)) + if (!numext::is_exactly_zero(ci)) { // find typename Lhs::InnerIterator it(lhs, i); @@ -270,11 +272,11 @@ struct sparse_solve_triangular_sparse_selector } - Index count = 0; +// Index count = 0; // FIXME compute a reference value to filter zeros for (typename AmbiVector::Iterator it(tempVector/*,1e-12*/); it; ++it) { - ++ count; +// ++ count; // std::cerr << "fill " << it.index() << ", " << col << "\n"; // std::cout << it.value() << " "; // FIXME use insertBack @@ -299,8 +301,8 @@ void TriangularViewImpl::solveInPlace(SparseMatrixBa // enum { copy = internal::traits::Flags & RowMajorBit }; 
-// typedef typename internal::conditional::type, OtherDerived&>::type OtherCopy; +// typedef std::conditional_t::type, OtherDerived&> OtherCopy; // OtherCopy otherCopy(other.derived()); internal::sparse_solve_triangular_sparse_selector::run(derived().nestedExpression(), other.derived()); diff --git a/libs/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h new file mode 100644 index 0000000..78ebfcc --- /dev/null +++ b/libs/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SPARSELU_MODULE_H +#error "Please include Eigen/SparseLU instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU.h b/libs/eigen/Eigen/src/SparseLU/SparseLU.h index 0c8d893..1e69924 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU.h @@ -12,9 +12,11 @@ #ifndef EIGEN_SPARSE_LU_H #define EIGEN_SPARSE_LU_H +#include "./InternalHeaderCheck.h" + namespace Eigen { -template > class SparseLU; +template > class SparseLU; template struct SparseLUMatrixLReturnType; template struct SparseLUMatrixUReturnType; @@ -35,8 +37,8 @@ public: MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - SparseLUTransposeView() : m_sparseLU(NULL) {} - SparseLUTransposeView(const SparseLUTransposeView& view) { + SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} @@ -119,25 +121,25 @@ private: * If this is the case for your matrices, you can try the basic scaling method at * "unsupported/Eigen/src/IterativeSolvers/Scaling.h" * - * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<> - * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD + * \tparam MatrixType_ The type of the sparse matrix. It must be a column-major SparseMatrix<> + * \tparam OrderingType_ The ordering method to use, either AMD, COLAMD or METIS. 
Default is COLAMD * * \implsparsesolverconcept * * \sa \ref TutorialSparseSolverConcept * \sa \ref OrderingMethods_Module */ -template -class SparseLU : public SparseSolverBase >, public internal::SparseLUImpl +template +class SparseLU : public SparseSolverBase >, public internal::SparseLUImpl { protected: - typedef SparseSolverBase > APIBase; + typedef SparseSolverBase > APIBase; using APIBase::m_isInitialized; public: using APIBase::_solve_impl; - typedef _MatrixType MatrixType; - typedef _OrderingType OrderingType; + typedef MatrixType_ MatrixType; + typedef OrderingType_ OrderingType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -197,9 +199,9 @@ class SparseLU : public SparseSolverBase >, * * \sa adjoint(), solve() */ - const SparseLUTransposeView > transpose() + const SparseLUTransposeView > transpose() { - SparseLUTransposeView > transposeView; + SparseLUTransposeView > transposeView; transposeView.setSparseLU(this); transposeView.setIsInitialized(this->m_isInitialized); return transposeView; @@ -218,9 +220,9 @@ class SparseLU : public SparseSolverBase >, * * \sa transpose(), solve() */ - const SparseLUTransposeView > adjoint() + const SparseLUTransposeView > adjoint() { - SparseLUTransposeView > adjointView; + SparseLUTransposeView > adjointView; adjointView.setSparseLU(this); adjointView.setIsInitialized(this->m_isInitialized); return adjointView; @@ -250,9 +252,9 @@ class SparseLU : public SparseSolverBase >, * y = b; matrixU().solveInPlace(y); * \endcode */ - SparseLUMatrixUReturnType > matrixU() const + SparseLUMatrixUReturnType > > matrixU() const { - return SparseLUMatrixUReturnType >(m_Lstore, m_Ustore); + return SparseLUMatrixUReturnType > >(m_Lstore, m_Ustore); } /** @@ -452,8 +454,8 @@ class SparseLU : public SparseSolverBase >, return (m_detPermR * m_detPermC) > 0 ? det : -det; } - Index nnzL() const { return m_nnzL; }; - Index nnzU() const { return m_nnzU; }; + Index nnzL() const { return m_nnzL; } + Index nnzU() const { return m_nnzU; } protected: // Functions @@ -474,7 +476,7 @@ class SparseLU : public SparseSolverBase >, std::string m_lastError; NCMatrix m_mat; // The input (permuted ) matrix SCMatrix m_Lstore; // The lower triangular matrix (supernodal) - MappedSparseMatrix m_Ustore; // The upper triangular matrix + Map> m_Ustore; // The upper triangular matrix PermutationType m_perm_c; // Column permutation PermutationType m_perm_r ; // Row permutation IndexVector m_etree; // Column elimination tree @@ -752,10 +754,13 @@ void SparseLU::factorize(const MatrixType& matrix) info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if ( info ) { - m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT "; + m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR"; +#ifndef EIGEN_NO_IO std::ostringstream returnInfo; - returnInfo << info; + returnInfo << " ...
ZERO COLUMN AT "; + returnInfo << info; m_lastError += returnInfo.str(); +#endif m_info = NumericalIssue; m_factorizationIsOk = false; return; @@ -789,7 +794,7 @@ void SparseLU::factorize(const MatrixType& matrix) // Create supernode matrix L m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); // Create the column major upper sparse matrix U; - new (&m_Ustore) MappedSparseMatrix ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); + new (&m_Ustore) Map> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); m_info = Success; m_factorizationIsOk = true; @@ -814,6 +819,31 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator m_mapL.template solveTransposedInPlace(X); } + SparseMatrix toSparse() const { + ArrayXi colCount = ArrayXi::Ones(cols()); + for (Index i = 0; i < cols(); i++) { + typename MappedSupernodalType::InnerIterator iter(m_mapL, i); + for (; iter; ++iter) { + if (iter.row() > iter.col()) { + colCount(iter.col())++; + } + } + } + SparseMatrix sL(rows(), cols()); + sL.reserve(colCount); + for (Index i = 0; i < cols(); i++) { + sL.insert(i, i) = 1.0; + typename MappedSupernodalType::InnerIterator iter(m_mapL, i); + for (; iter; ++iter) { + if (iter.row() > iter.col()) { + sL.insert(iter.row(), iter.col()) = iter.value(); + } + } + } + sL.makeCompressed(); + return sL; + } + const MappedSupernodalType& m_mapL; }; @@ -830,7 +860,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator template void solveInPlace(MatrixBase &X) const { Index nrhs = X.cols(); - Index n = X.rows(); // Backward solve with U for (Index k = m_mapL.nsuper(); k >= 0; k--) { @@ -850,7 +879,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { // FIXME: the following lines should use Block expressions and not Map! 
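// ---- editor's sketch (not part of the patch) --------------------------------
// This hunk resolves the FIXME above for the right-hand side: rather than aliasing
// rows [fsupc, fsupc+nsupc) of X through a hand-built strided Map, it takes the
// Block expression X.derived().middleRows(fsupc, nsupc), which carries the correct
// outer stride by construction. A standalone comparison of the two spellings
// (shapes chosen arbitrarily for illustration):
#include <Eigen/Dense>

inline void middle_rows_demo() {
  Eigen::MatrixXd X = Eigen::MatrixXd::Random(8, 3);
  const int fsupc = 2, nsupc = 4;

  // Block expression: stride bookkeeping is handled by Eigen.
  auto U = X.middleRows(fsupc, nsupc);

  // Raw Map equivalent for column-major data: same memory, manual OuterStride.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::OuterStride<>>
      Umap(&X(fsupc, 0), nsupc, X.cols(), Eigen::OuterStride<>(X.rows()));

  U.setZero();  // writes through the block are visible via the map, and vice versa
  (void)Umap;
}
// ------------------------------------------------------------------------------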
Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); U = A.template triangularView().solve(U); } @@ -873,7 +902,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { using numext::conj; Index nrhs = X.cols(); - Index n = X.rows(); // Forward solve with U for (Index k = 0; k <= m_mapL.nsuper(); k++) { @@ -904,7 +932,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator else { Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = A.adjoint().template triangularView().solve(U); else @@ -913,6 +941,32 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator }// End For U-solve } + SparseMatrix toSparse() { + ArrayXi rowCount = ArrayXi::Zero(rows()); + for (Index i = 0; i < cols(); i++) { + typename MatrixLType::InnerIterator iter(m_mapL, i); + for (; iter; ++iter) { + if (iter.row() <= iter.col()) { + rowCount(iter.row())++; + } + } + } + + SparseMatrix sU(rows(), cols()); + sU.reserve(rowCount); + for (Index i = 0; i < cols(); i++) { + typename MatrixLType::InnerIterator iter(m_mapL, i); + for (; iter; ++iter) { + if (iter.row() <= iter.col()) { + sU.insert(iter.row(), iter.col()) = iter.value(); + } + } + } + sU.makeCompressed(); + const SparseMatrix u = m_mapU; // convert to RowMajor + sU += u; + return sU; + } const MatrixLType& m_mapL; const MatrixUType& m_mapU; diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLUImpl.h b/libs/eigen/Eigen/src/SparseLU/SparseLUImpl.h index fc0cfc4..daec837 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLUImpl.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLUImpl.h @@ -9,6 +9,8 @@ #ifndef SPARSELU_IMPL_H #define SPARSELU_IMPL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_Memory.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_Memory.h index 349bfd5..798745f 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_Memory.h @@ -31,6 +31,8 @@ #ifndef EIGEN_SPARSELU_MEMORY #define EIGEN_SPARSELU_MEMORY +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_Structs.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_Structs.h index cf5ec44..3ab0c72 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_Structs.h @@ -68,10 +68,12 @@ #ifndef EIGEN_LU_STRUCTS #define EIGEN_LU_STRUCTS +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { - -typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; + +enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}; template struct LU_GlobalLU_t { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 0be293d..adfc63a 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SPARSELU_SUPERNODAL_MATRIX_H #define EIGEN_SPARSELU_SUPERNODAL_MATRIX_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace 
internal { @@ -29,12 +31,12 @@ namespace internal { * SuperInnerIterator to iterate through all supernodes * Function for triangular solve */ -template +template class MappedSuperNodalMatrix { public: - typedef _Scalar Scalar; - typedef _StorageIndex StorageIndex; + typedef Scalar_ Scalar; + typedef StorageIndex_ StorageIndex; typedef Matrix IndexVector; typedef Matrix ScalarVector; public: @@ -274,9 +276,8 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co // Triangular solve Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); - U = A.template triangularView().solve(U); - + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); + U = A.template triangularView().solve(U); // Matrix-vector product new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.topRows(nrow).noalias() = A * U; @@ -349,7 +350,7 @@ void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = U - A.adjoint() * work.topRows(nrow); else diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_Utils.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_Utils.h index 9e3dab4..e399fed 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_Utils.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SPARSELU_UTILS_H #define EIGEN_SPARSELU_UTILS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h index b57f068..d5c29b3 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h @@ -31,6 +31,8 @@ #ifndef SPARSELU_COLUMN_BMOD_H #define SPARSELU_COLUMN_BMOD_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h index 5a2c941..be4cfd1 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -31,6 +31,8 @@ #define SPARSELU_COLUMN_DFS_H template class SparseLUImpl; +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h index c32d8d8..e06b2a0 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h @@ -29,6 +29,8 @@ #ifndef SPARSELU_COPY_TO_UCOL_H #define SPARSELU_COPY_TO_UCOL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index e37c2fe..034d379 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SPARSELU_GEMM_KERNEL_H #define EIGEN_SPARSELU_GEMM_KERNEL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h 
b/libs/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h index 6f75d50..2a8d80b 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h @@ -28,6 +28,8 @@ #ifndef SPARSELU_HEAP_RELAX_SNODE_H #define SPARSELU_HEAP_RELAX_SNODE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -75,8 +77,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe // Identify the relaxed supernodes by postorder traversal of the etree Index snode_start; // beginning of a snode StorageIndex k; - Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree - Index nsuper_et = 0; // Number of relaxed snodes in the original etree StorageIndex l; for (j = 0; j < n; ) { @@ -88,7 +88,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe parent = et(j); } // Found a supernode in postordered etree, j is the last column - ++nsuper_et_post; k = StorageIndex(n); for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i)); @@ -97,7 +96,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe { // This is also a supernode in the original etree relax_end(k) = l; // Record last column - ++nsuper_et; } else { @@ -107,7 +105,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe if (descendants(i) == 0) { relax_end(l) = l; - ++nsuper_et; } } } diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 8c1b3e8..424f93c 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -11,6 +11,8 @@ #ifndef SPARSELU_KERNEL_BMOD_H #define SPARSELU_KERNEL_BMOD_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h index f052001..8cd331a 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -31,6 +31,8 @@ #ifndef SPARSELU_PANEL_BMOD_H #define SPARSELU_PANEL_BMOD_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h index 155df73..c3ff013 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h @@ -30,6 +30,8 @@ #ifndef SPARSELU_PANEL_DFS_H #define SPARSELU_PANEL_DFS_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h index a86dac9..6daed91 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h @@ -30,6 +30,8 @@ #ifndef SPARSELU_PIVOTL_H #define SPARSELU_PIVOTL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h b/libs/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h index ad32fed..e5da73b 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h @@ -30,6 +30,8 @@ #ifndef SPARSELU_PRUNEL_H #define SPARSELU_PRUNEL_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h 
b/libs/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h index c408d01..ed79532 100644 --- a/libs/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +++ b/libs/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h @@ -28,6 +28,8 @@ #ifndef SPARSELU_RELAX_SNODE_H #define SPARSELU_RELAX_SNODE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { diff --git a/libs/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h new file mode 100644 index 0000000..0564e93 --- /dev/null +++ b/libs/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SPARSEQR_MODULE_H +#error "Please include Eigen/SparseQR instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SparseQR/SparseQR.h b/libs/eigen/Eigen/src/SparseQR/SparseQR.h index d1fb96f..f825092 100644 --- a/libs/eigen/Eigen/src/SparseQR/SparseQR.h +++ b/libs/eigen/Eigen/src/SparseQR/SparseQR.h @@ -11,6 +11,8 @@ #ifndef EIGEN_SPARSE_QR_H #define EIGEN_SPARSE_QR_H +#include "./InternalHeaderCheck.h" + namespace Eigen { template class SparseQR; @@ -59,8 +61,8 @@ namespace internal { * R is the sparse triangular or trapezoidal matrix. The latter occurs when A is rank-deficient. * matrixR().topLeftCorner(rank(), rank()) always returns a triangular factor of full rank. * - * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<> - * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module + * \tparam MatrixType_ The type of the sparse matrix A, must be a column-major SparseMatrix<> + * \tparam OrderingType_ The fill-reducing ordering method. See the \link OrderingMethods_Module * OrderingMethods \endlink module for the list of built-in and external ordering methods. * * \implsparsesolverconcept @@ -80,16 +82,16 @@ namespace internal { * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. * */ -template -class SparseQR : public SparseSolverBase > +template +class SparseQR : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase > Base; using Base::m_isInitialized; public: using Base::_solve_impl; - typedef _MatrixType MatrixType; - typedef _OrderingType OrderingType; + typedef MatrixType_ MatrixType; + typedef OrderingType_ OrderingType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -321,7 +323,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) { eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR"); // Copy to a column major matrix if the input is rowmajor - typename internal::conditional::type matCpy(mat); + std::conditional_t matCpy(mat); // Compute the column fill reducing ordering OrderingType ord; ord(matCpy, m_perm_c); diff --git a/libs/eigen/Eigen/src/StlSupport/StdDeque.h b/libs/eigen/Eigen/src/StlSupport/StdDeque.h index 6d47e75..1e95182 100644 --- a/libs/eigen/Eigen/src/StlSupport/StdDeque.h +++ b/libs/eigen/Eigen/src/StlSupport/StdDeque.h @@ -11,6 +11,10 @@ #ifndef EIGEN_STDDEQUE_H #define EIGEN_STDDEQUE_H +#ifndef EIGEN_STDDEQUE_MODULE_H +#error "Please include Eigen/StdDeque instead of including this file directly."
+#endif + #include "details.h" /** @@ -44,73 +48,4 @@ namespace std \ }; \ } -// check whether we really need the std::deque specialization -#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */ - -namespace std { - -#define EIGEN_STD_DEQUE_SPECIALIZATION_BODY \ - public: \ - typedef T value_type; \ - typedef typename deque_base::allocator_type allocator_type; \ - typedef typename deque_base::size_type size_type; \ - typedef typename deque_base::iterator iterator; \ - typedef typename deque_base::const_iterator const_iterator; \ - explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {} \ - template \ - deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \ - : deque_base(first, last, a) {} \ - deque(const deque& c) : deque_base(c) {} \ - explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ - deque& operator=(const deque& x) { \ - deque_base::operator=(x); \ - return *this; \ - } - - template - class deque > - : public deque > -{ - typedef deque > deque_base; - EIGEN_STD_DEQUE_SPECIALIZATION_BODY - - void resize(size_type new_size) - { resize(new_size, T()); } - -#if defined(_DEQUE_) - // workaround MSVC std::deque implementation - void resize(size_type new_size, const value_type& x) - { - if (deque_base::size() < new_size) - deque_base::_Insert_n(deque_base::end(), new_size - deque_base::size(), x); - else if (new_size < deque_base::size()) - deque_base::erase(deque_base::begin() + new_size, deque_base::end()); - } - void push_back(const value_type& x) - { deque_base::push_back(x); } - void push_front(const value_type& x) - { deque_base::push_front(x); } - using deque_base::insert; - iterator insert(const_iterator position, const value_type& x) - { return deque_base::insert(position,x); } - void insert(const_iterator position, size_type new_size, const value_type& x) - { deque_base::insert(position, new_size, x); } -#else - // default implementation which should always work. - void resize(size_type new_size, const value_type& x) - { - if (new_size < deque_base::size()) - deque_base::erase(deque_base::begin() + new_size, deque_base::end()); - else if (new_size > deque_base::size()) - deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); - } -#endif - }; -} - -#endif // check whether specialization is actually required - #endif // EIGEN_STDDEQUE_H diff --git a/libs/eigen/Eigen/src/StlSupport/StdList.h b/libs/eigen/Eigen/src/StlSupport/StdList.h index 8ba3fad..da36677 100644 --- a/libs/eigen/Eigen/src/StlSupport/StdList.h +++ b/libs/eigen/Eigen/src/StlSupport/StdList.h @@ -10,6 +10,10 @@ #ifndef EIGEN_STDLIST_H #define EIGEN_STDLIST_H +#ifndef EIGEN_STDLIST_MODULE_H +#error "Please include Eigen/StdList instead of including this file directly." +#endif + #include "details.h" /** @@ -43,64 +47,4 @@ namespace std \ }; \ } -// check whether we really need the std::list specialization -#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_LIST) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). 
*/ - -namespace std -{ - -#define EIGEN_STD_LIST_SPECIALIZATION_BODY \ - public: \ - typedef T value_type; \ - typedef typename list_base::allocator_type allocator_type; \ - typedef typename list_base::size_type size_type; \ - typedef typename list_base::iterator iterator; \ - typedef typename list_base::const_iterator const_iterator; \ - explicit list(const allocator_type& a = allocator_type()) : list_base(a) {} \ - template \ - list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \ - : list_base(first, last, a) {} \ - list(const list& c) : list_base(c) {} \ - explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start_, iterator end_) : list_base(start_, end_) {} \ - list& operator=(const list& x) { \ - list_base::operator=(x); \ - return *this; \ - } - - template - class list > - : public list > - { - typedef list > list_base; - EIGEN_STD_LIST_SPECIALIZATION_BODY - - void resize(size_type new_size) - { resize(new_size, T()); } - - void resize(size_type new_size, const value_type& x) - { - if (list_base::size() < new_size) - list_base::insert(list_base::end(), new_size - list_base::size(), x); - else - while (new_size < list_base::size()) list_base::pop_back(); - } - -#if defined(_LIST_) - // workaround MSVC std::list implementation - void push_back(const value_type& x) - { list_base::push_back(x); } - using list_base::insert; - iterator insert(const_iterator position, const value_type& x) - { return list_base::insert(position,x); } - void insert(const_iterator position, size_type new_size, const value_type& x) - { list_base::insert(position, new_size, x); } -#endif - }; -} - -#endif // check whether specialization is actually required - #endif // EIGEN_STDLIST_H diff --git a/libs/eigen/Eigen/src/StlSupport/StdVector.h b/libs/eigen/Eigen/src/StlSupport/StdVector.h index 9fcf19b..02dfb39 100644 --- a/libs/eigen/Eigen/src/StlSupport/StdVector.h +++ b/libs/eigen/Eigen/src/StlSupport/StdVector.h @@ -11,6 +11,10 @@ #ifndef EIGEN_STDVECTOR_H #define EIGEN_STDVECTOR_H +#ifndef EIGEN_STDVECTOR_MODULE_H +#error "Please include Eigen/StdVector instead of including this file directly." 
+#endif + #include "details.h" /** @@ -44,88 +48,4 @@ namespace std \ }; \ } -// Don't specialize if containers are implemented according to C++11 -#if !EIGEN_HAS_CXX11_CONTAINERS - -namespace std { - -#define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \ - public: \ - typedef T value_type; \ - typedef typename vector_base::allocator_type allocator_type; \ - typedef typename vector_base::size_type size_type; \ - typedef typename vector_base::iterator iterator; \ - typedef typename vector_base::const_iterator const_iterator; \ - explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {} \ - template \ - vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \ - : vector_base(first, last, a) {} \ - vector(const vector& c) : vector_base(c) {} \ - explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ - vector& operator=(const vector& x) { \ - vector_base::operator=(x); \ - return *this; \ - } - - template - class vector > - : public vector > -{ - typedef vector > vector_base; - EIGEN_STD_VECTOR_SPECIALIZATION_BODY - - void resize(size_type new_size) - { resize(new_size, T()); } - -#if defined(_VECTOR_) - // workaround MSVC std::vector implementation - void resize(size_type new_size, const value_type& x) - { - if (vector_base::size() < new_size) - vector_base::_Insert_n(vector_base::end(), new_size - vector_base::size(), x); - else if (new_size < vector_base::size()) - vector_base::erase(vector_base::begin() + new_size, vector_base::end()); - } - void push_back(const value_type& x) - { vector_base::push_back(x); } - using vector_base::insert; - iterator insert(const_iterator position, const value_type& x) - { return vector_base::insert(position,x); } - void insert(const_iterator position, size_type new_size, const value_type& x) - { vector_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_VECTOR) && (!(EIGEN_GNUC_AT_LEAST(4,1))) - /* Note that before gcc-4.1 we already have: std::vector::resize(size_type,const T&). - * However, this specialization is still needed to make the above EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION trick to work. */ - void resize(size_type new_size, const value_type& x) - { - vector_base::resize(new_size,x); - } -#elif defined(_GLIBCXX_VECTOR) && EIGEN_GNUC_AT_LEAST(4,2) - // workaround GCC std::vector implementation - void resize(size_type new_size, const value_type& x) - { - if (new_size < vector_base::size()) - vector_base::_M_erase_at_end(this->_M_impl._M_start + new_size); - else - vector_base::insert(vector_base::end(), new_size - vector_base::size(), x); - } -#else - // either GCC 4.1 or non-GCC - // default implementation which should always work. 
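// ---------------------------------------------------------------------------
// [editor's aside] The std::vector/deque/list specializations being deleted in
// this patch existed only to paper over pre-C++11 container implementations;
// the removed "#if !EIGEN_HAS_CXX11_CONTAINERS" guards say as much. With C++11
// containers, the long-documented supported pattern is a plain standard
// container with Eigen::aligned_allocator. A minimal hedged sketch (function
// name is illustrative, not part of this patch):
#include <vector>
#include <Eigen/Dense>

inline int example_aligned_vector() {
  std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > v(3);
  v.resize(5, Eigen::Vector4f::Zero());  // resize(n, value) works out of the box
  return static_cast<int>(v.size());
}
// ---------------------------------------------------------------------------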
- void resize(size_type new_size, const value_type& x) - { - if (new_size < vector_base::size()) - vector_base::erase(vector_base::begin() + new_size, vector_base::end()); - else if (new_size > vector_base::size()) - vector_base::insert(vector_base::end(), new_size - vector_base::size(), x); - } -#endif - }; -} -#endif // !EIGEN_HAS_CXX11_CONTAINERS - - #endif // EIGEN_STDVECTOR_H diff --git a/libs/eigen/Eigen/src/StlSupport/details.h b/libs/eigen/Eigen/src/StlSupport/details.h index 2cfd13e..29fd871 100644 --- a/libs/eigen/Eigen/src/StlSupport/details.h +++ b/libs/eigen/Eigen/src/StlSupport/details.h @@ -52,11 +52,7 @@ namespace Eigen { // in std::vector::resize(size_t s,T x) won't be aligned and generate an error // even if this function is never called. Whence this little wrapper. #define EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T) \ - typename Eigen::internal::conditional< \ - Eigen::internal::is_arithmetic::value, \ - T, \ - Eigen::internal::workaround_msvc_stl_support \ - >::type + std::conditional_t::value, T, Eigen::internal::workaround_msvc_stl_support > namespace internal { template struct workaround_msvc_stl_support : public T diff --git a/libs/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..94a62b5 --- /dev/null +++ b/libs/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_SUPERLUSUPPORT_MODULE_H +#error "Please include Eigen/SuperLUSupport instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/libs/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h index d1d3ad7..4bac22d 100644 --- a/libs/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/libs/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -10,6 +10,8 @@ #ifndef EIGEN_SUPERLUSUPPORT_H #define EIGEN_SUPERLUSUPPORT_H +#include "./InternalHeaderCheck.h" + namespace Eigen { #if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5) @@ -295,14 +297,14 @@ SluMatrix asSluMatrix(MatrixType& mat) /** View a Super LU matrix as an Eigen expression */ template -MappedSparseMatrix map_superlu(SluMatrix& sluMat) +Map > map_superlu(SluMatrix& sluMat) { eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR) || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC)); Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow; - return MappedSparseMatrix( + return Map >( sluMat.nrow, sluMat.ncol, sluMat.storage.outerInd[outerSize], sluMat.storage.outerInd, sluMat.storage.innerInd, reinterpret_cast(sluMat.storage.values) ); } @@ -313,7 +315,7 @@ MappedSparseMatrix map_superlu(SluMatrix& sluMat) * \class SuperLUBase * \brief The base class for the direct and incomplete LU factorization of SuperLU */ -template +template class SuperLUBase : public SparseSolverBase { protected: @@ -321,7 +323,7 @@ class SuperLUBase : public SparseSolverBase using Base::derived; using Base::m_isInitialized; public: - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -476,7 +478,7 @@ class SuperLUBase : public SparseSolverBase * using the SuperLU library. The sparse matrix A must be squared and invertible. The vectors or matrices * X and B can be either dense or sparse. 
* - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. * @@ -484,12 +486,12 @@ class SuperLUBase : public SparseSolverBase * * \sa \ref TutorialSparseSolverConcept, class SparseLU */ -template -class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> > +template +class SuperLU : public SuperLUBase > { public: - typedef SuperLUBase<_MatrixType,SuperLU> Base; - typedef _MatrixType MatrixType; + typedef SuperLUBase Base; + typedef MatrixType_ MatrixType; typedef typename Base::Scalar Scalar; typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; @@ -830,19 +832,19 @@ typename SuperLU::Scalar SuperLU::determinant() const * * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. * - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * * \implsparsesolverconcept * * \sa \ref TutorialSparseSolverConcept, class IncompleteLUT, class ConjugateGradient, class BiCGSTAB */ -template -class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> > +template +class SuperILU : public SuperLUBase > { public: - typedef SuperLUBase<_MatrixType,SuperILU> Base; - typedef _MatrixType MatrixType; + typedef SuperLUBase Base; + typedef MatrixType_ MatrixType; typedef typename Base::Scalar Scalar; typedef typename Base::RealScalar RealScalar; diff --git a/libs/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h b/libs/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h new file mode 100644 index 0000000..64112f1 --- /dev/null +++ b/libs/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_UMFPACKSUPPORT_MODULE_H +#error "Please include Eigen/UmfPackSupport instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h b/libs/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h index e3a333f..d9a8d38 100644 --- a/libs/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/libs/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -20,6 +20,8 @@ #endif #endif +#include "./InternalHeaderCheck.h" + namespace Eigen { /* TODO extract L, extract U, compute det, etc... */ @@ -278,21 +280,21 @@ inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double * * \warning The input matrix A should be in a \b compressed and \b column-major form. * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. 
- * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<> * * \implsparsesolverconcept * * \sa \ref TutorialSparseSolverConcept, class SparseLU */ -template -class UmfPackLU : public SparseSolverBase > +template +class UmfPackLU : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase > Base; using Base::m_isInitialized; public: using Base::_solve_impl; - typedef _MatrixType MatrixType; + typedef MatrixType_ MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; @@ -529,16 +531,16 @@ class UmfPackLU : public SparseSolverBase > template void grab(const EigenBase &A) { - mp_matrix.~UmfpackMatrixRef(); - ::new (&mp_matrix) UmfpackMatrixRef(A.derived()); + internal::destroy_at(&mp_matrix); + internal::construct_at(&mp_matrix, A.derived()); } void grab(const UmfpackMatrixRef &A) { if(&(A.derived()) != &mp_matrix) { - mp_matrix.~UmfpackMatrixRef(); - ::new (&mp_matrix) UmfpackMatrixRef(A); + internal::destroy_at(&mp_matrix); + internal::construct_at(&mp_matrix, A); } } diff --git a/libs/eigen/Eigen/src/misc/Image.h b/libs/eigen/Eigen/src/misc/Image.h index b8b8a04..b500036 100644 --- a/libs/eigen/Eigen/src/misc/Image.h +++ b/libs/eigen/Eigen/src/misc/Image.h @@ -10,6 +10,8 @@ #ifndef EIGEN_MISC_IMAGE_H #define EIGEN_MISC_IMAGE_H +#include "./InternalHeaderCheck.h" + namespace Eigen { namespace internal { @@ -32,10 +34,10 @@ struct traits > > ReturnType; }; -template struct image_retval_base - : public ReturnByValue > +template struct image_retval_base + : public ReturnByValue > { - typedef _DecompositionType DecompositionType; + typedef DecompositionType_ DecompositionType; typedef typename DecompositionType::MatrixType MatrixType; typedef ReturnByValue Base; diff --git a/libs/eigen/Eigen/src/misc/InternalHeaderCheck.h b/libs/eigen/Eigen/src/misc/InternalHeaderCheck.h new file mode 100644 index 0000000..1cea572 --- /dev/null +++ b/libs/eigen/Eigen/src/misc/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CORE_MODULE_H +#error "Please include Eigen/Core instead of including headers inside the src directory directly." 
+#endif
diff --git a/libs/eigen/Eigen/src/misc/Kernel.h b/libs/eigen/Eigen/src/misc/Kernel.h
index bef5d6f..7abfbb7 100644
--- a/libs/eigen/Eigen/src/misc/Kernel.h
+++ b/libs/eigen/Eigen/src/misc/Kernel.h
@@ -10,6 +10,8 @@
 #ifndef EIGEN_MISC_KERNEL_H
 #define EIGEN_MISC_KERNEL_H
 
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
@@ -34,10 +36,10 @@ struct traits<kernel_retval_base<DecompositionType> >
         > ReturnType;
 };
 
-template<typename _DecompositionType> struct kernel_retval_base
- : public ReturnByValue<kernel_retval_base<_DecompositionType> >
+template<typename DecompositionType_> struct kernel_retval_base
+ : public ReturnByValue<kernel_retval_base<DecompositionType_> >
 {
-  typedef _DecompositionType DecompositionType;
+  typedef DecompositionType_ DecompositionType;
   typedef ReturnByValue<kernel_retval_base> Base;
 
   explicit kernel_retval_base(const DecompositionType& dec)
diff --git a/libs/eigen/Eigen/src/misc/RealSvd2x2.h b/libs/eigen/Eigen/src/misc/RealSvd2x2.h
index abb4d3c..5dd75f3 100644
--- a/libs/eigen/Eigen/src/misc/RealSvd2x2.h
+++ b/libs/eigen/Eigen/src/misc/RealSvd2x2.h
@@ -11,6 +11,8 @@
 #ifndef EIGEN_REALSVD2X2_H
 #define EIGEN_REALSVD2X2_H
 
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
diff --git a/libs/eigen/Eigen/src/misc/lapacke.h b/libs/eigen/Eigen/src/misc/lapacke.h
old mode 100755
new mode 100644
diff --git a/libs/eigen/Eigen/src/misc/lapacke_helpers.h b/libs/eigen/Eigen/src/misc/lapacke_helpers.h
new file mode 100644
index 0000000..b6ad6e8
--- /dev/null
+++ b/libs/eigen/Eigen/src/misc/lapacke_helpers.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 Erik Schultheis
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LAPACKE_HELPERS_H
+#define EIGEN_LAPACKE_HELPERS_H
+
+#include "./InternalHeaderCheck.h"
+
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "lapacke.h"
+#endif
+
+namespace Eigen {
+namespace internal {
+/**
+ * \internal
+ * \brief Implementation details and helper functions for the lapacke glue code.
+ */
+namespace lapacke_helpers {
+
+// ---------------------------------------------------------------------------------------------------------------------
+// Translation from Eigen to Lapacke for types and constants
+// ---------------------------------------------------------------------------------------------------------------------
+
+// For complex numbers, the types in Eigen and Lapacke are different, but layout compatible.
+template<typename Scalar>
+struct translate_type_imp;
+template<>
+struct translate_type_imp<float> {
+  using type = float;
+};
+template<>
+struct translate_type_imp<double> {
+  using type = double;
+};
+template<>
+struct translate_type_imp<std::complex<double>> {
+  using type = lapack_complex_double;
+};
+template<>
+struct translate_type_imp<std::complex<float>> {
+  using type = lapack_complex_float;
+};
+
+/// Given an Eigen type, this is defined to be the corresponding, layout-compatible lapack type
+template<typename Scalar>
+using translated_type = typename translate_type_imp<Scalar>::type;
+
+/// These functions convert their arguments from Eigen to Lapack types
+/// This function performs conversion for any of the translations defined above.
+template<typename Source, typename Target = translated_type<Source>>
+EIGEN_ALWAYS_INLINE auto to_lapack(Source value) { return static_cast<Target>(value); }
+
+/// This function performs conversions for pointer types corresponding to the translations above.
+/// This is valid because the translations are between layout-compatible types.
+template<typename Source, typename Target = translated_type<Source>>
+EIGEN_ALWAYS_INLINE auto to_lapack(Source *value) { return reinterpret_cast<Target*>(value); }
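// ---------------------------------------------------------------------------
// [editor's aside] A standalone sketch of the layout-compatibility idiom that
// to_lapack relies on above. 'toy_lapack_complex' and 'toy_to_lapack' are
// hypothetical names standing in for lapack_complex_float; they are not part
// of this patch.
#include <complex>
#include <cassert>

struct toy_lapack_complex { float re, im; };  // same layout as std::complex<float>

inline toy_lapack_complex* toy_to_lapack(std::complex<float>* p) {
  // Only valid because the two types are layout compatible; this is the exact
  // contract the LAPACKE C interface relies on.
  return reinterpret_cast<toy_lapack_complex*>(p);
}

inline void toy_demo() {
  std::complex<float> z(1.0f, 2.0f);
  toy_lapack_complex* c = toy_to_lapack(&z);
  assert(c->re == 1.0f && c->im == 2.0f);
}
// ---------------------------------------------------------------------------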
+
+/// This function converts the Eigen Index to a lapack index, with possible range checks
+/// \sa internal::convert_index
+EIGEN_ALWAYS_INLINE lapack_int to_lapack(Index index) {
+  return convert_index<lapack_int>(index);
+}
+
+/// translates storage order of the given Eigen object to the corresponding lapack constant
+template<typename Derived>
+EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR lapack_int lapack_storage_of(const EigenBase<Derived> &) {
+  return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
+}
+
+/// translate UpLo type to the corresponding letter code
+template<UpLoType mode> char translate_mode;
+template<> constexpr char translate_mode<Lower> = 'L';
+template<> constexpr char translate_mode<Upper> = 'U';
+
+
+// ---------------------------------------------------------------------------------------------------------------------
+// Automatic generation of low-level wrappers
+// ---------------------------------------------------------------------------------------------------------------------
+
+/*!
+ * \internal
+ * \brief Helper type to facilitate the wrapping of raw LAPACKE functions for different types into a single, overloaded C++ function.
+ * This is achieved in combination with \ref EIGEN_MAKE_LAPACKE_WRAPPER
+ * \details This implementation works by providing an overloaded call function that just forwards its arguments to the
+ * underlying lapack function. Each of these overloads is enabled only if the call is actually well formed.
+ * Because these lapack functions take pointers to the underlying scalar type as arguments, even though the actual Scalars
+ * would be implicitly convertible, the pointers are not and therefore only a single overload can be valid at the same time.
+ * Thus, despite all functions taking fully generic `Args&&... args` as arguments, there is never any ambiguity.
+ */
+template<typename DoubleFn, typename SingleFn, typename DoubleCpxFn, typename SingleCpxFn>
+struct WrappingHelper {
+  // The naming of double, single, double complex and single complex is purely for readability
+  // and doesn't actually affect the workings of this class. In principle, the arguments can
+  // be supplied in any permuted order.
+  DoubleFn double_; SingleFn single_; DoubleCpxFn double_cpx_; SingleCpxFn single_cpx_;
+
+  template<typename... Args>
+  auto call(Args&&... args) -> decltype(double_(std::forward<Args>(args)...)) {
+    return double_(std::forward<Args>(args)...);
+  }
+
+  template<typename... Args>
+  auto call(Args&&... args) -> decltype(single_(std::forward<Args>(args)...)){
+    return single_(std::forward<Args>(args)...);
+  }
+
+  template<typename... Args>
+  auto call(Args&&... args) -> decltype(double_cpx_(std::forward<Args>(args)...)){
+    return double_cpx_(std::forward<Args>(args)...);
+  }
+
+  template<typename... Args>
+  auto call(Args&&... args) -> decltype(single_cpx_(std::forward<Args>(args)...)){
+    return single_cpx_(std::forward<Args>(args)...);
+  }
+};
+
+/** \internal Helper function that generates a `WrappingHelper` object with the given function pointers and
+ * invokes its `call` method, thus selecting one of the overloads.
+ * \sa EIGEN_MAKE_LAPACKE_WRAPPER
+ */
+template<typename DoubleFn, typename SingleFn, typename DoubleCpxFn, typename SingleCpxFn, typename... Args>
+EIGEN_ALWAYS_INLINE auto call_wrapper(DoubleFn df, SingleFn sf, DoubleCpxFn dcf, SingleCpxFn scf, Args&&... args) {
+  WrappingHelper<DoubleFn, SingleFn, DoubleCpxFn, SingleCpxFn> helper{df, sf, dcf, scf};
+  return helper.call(std::forward<Args>(args)...);
+}
+
+/**
+ * \internal
+ * Generates a new function `Function` that dispatches to the corresponding LAPACKE_? prefixed functions.
+ * \sa WrappingHelper
+ */
+#define EIGEN_MAKE_LAPACKE_WRAPPER(FUNCTION) \
+template<typename... Args> \
+EIGEN_ALWAYS_INLINE auto FUNCTION(Args&&... args) { return call_wrapper(LAPACKE_d##FUNCTION, LAPACKE_s##FUNCTION, LAPACKE_z##FUNCTION, LAPACKE_c##FUNCTION, std::forward<Args>(args)...); }
+
+// Now with this macro and the helper wrappers, we can generate the dispatch for all the lapacke functions that are
+// used in Eigen.
+// We define these here instead of in the files where they are used because this allows us to #undef the macro again
+// right here
+EIGEN_MAKE_LAPACKE_WRAPPER(potrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(getrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(geqrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(gesdd)
+
+#undef EIGEN_MAKE_LAPACKE_WRAPPER
+}
+}
+}
+
+#endif // EIGEN_LAPACKE_HELPERS_H
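// ---------------------------------------------------------------------------
// [editor's aside] How the dispatch generated above is meant to be consumed.
// This is a hedged sketch in comment form only (the real callers live in
// Eigen's LAPACKE-backed decompositions, which are not part of this hunk).
// With the wrappers in scope, one spelling covers all four scalar types,
// e.g. for a Cholesky factorization of a column-major double matrix:
//
//   Eigen::MatrixXd A = ...;                       // also works for float/complex
//   lapack_int n    = to_lapack(A.rows());
//   lapack_int lda  = to_lapack(A.outerStride());
//   lapack_int info = potrf(lapack_storage_of(A), 'L', n, to_lapack(A.data()), lda);
//
// Overload resolution selects LAPACKE_dpotrf here purely because only the
// double* overload of WrappingHelper::call is well formed for these pointers.
// ---------------------------------------------------------------------------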
diff --git a/libs/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/libs/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 0e5d544..30e3ee1 100644
--- a/libs/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/libs/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -30,15 +30,27 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
  *
  * \sa max()
  */
-EIGEN_MAKE_CWISE_BINARY_OP(min,min)
+template<int NaNPropagation=PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+min
+#else
+(min)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
 
 /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
  *
  * \sa max()
  */
+template<int NaNPropagation=PropagateFast>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NaNPropagation>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 min
 #else
@@ -46,7 +58,7 @@ min
 #endif
 (const Scalar &other) const
 {
-  return (min)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (min)<NaNPropagation>(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
 /** \returns an expression of the coefficient-wise max of \c *this and \a other
@@ -56,14 +68,26 @@
  *
  * \sa min()
  */
-EIGEN_MAKE_CWISE_BINARY_OP(max,max)
+template<int NaNPropagation=PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+max
+#else
+(max)
+#endif
+(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived, const OtherDerived>(derived(), other.derived());
+}
 
 /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
  *
  * \sa min()
  */
+template<int NaNPropagation=PropagateFast>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived,
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NaNPropagation>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 max
@@ -72,7 +96,7 @@ max
 #endif
 (const Scalar &other) const
 {
-  return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
+  return (max)<NaNPropagation>(Derived::PlainObject::Constant(rows(), cols(), other));
 }
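// ---------------------------------------------------------------------------
// [editor's aside] A hedged usage sketch of the NaN-propagation template
// parameter introduced for min/max above (public API of Eigen 3.4-era
// releases; the function name is illustrative):
#include <Eigen/Dense>
#include <limits>

inline void nan_propagation_demo() {
  Eigen::ArrayXf a(2), b(2);
  a << 1.0f, std::numeric_limits<float>::quiet_NaN();
  b << 2.0f, 0.0f;
  Eigen::ArrayXf fast = a.min(b);                           // NaN handling unspecified (PropagateFast)
  Eigen::ArrayXf nans = a.min<Eigen::PropagateNaN>(b);      // NaN wins: {1, NaN}
  Eigen::ArrayXf nums = a.min<Eigen::PropagateNumbers>(b);  // numbers win: {1, 0}
  (void)fast; (void)nans; (void)nums;
}
// ---------------------------------------------------------------------------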
 
 /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
@@ -110,24 +134,12 @@ absolute_difference
  */
 EIGEN_MAKE_CWISE_BINARY_OP(pow,pow)
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(pow,pow)
-#else
-/** \returns an expression of the coefficients of \c *this rasied to the constant power \a exponent
+/** \returns an expression of the coefficient-wise atan2(\c *this, \a y), where \a y is the given array argument.
  *
- * \tparam T is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+ * This function computes the coefficient-wise atan2.
  *
- * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
- * unsupported module MatrixFunctions computes the matrix power.
- *
- * Example: \include Cwise_pow.cpp
- * Output: \verbinclude Cwise_pow.out
- *
- * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
  */
-template<typename T>
-const CwiseBinaryOp<internal::scalar_pow_op<Scalar,T>,Derived,Constant<T> > pow(const T& exponent) const;
-#endif
+EIGEN_MAKE_CWISE_BINARY_OP(atan2,atan2)
 
 // TODO code generating macros could be moved to Macros.h and could include generation of documentation
diff --git a/libs/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/libs/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 13c55f4..d8c1a84 100644
--- a/libs/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/libs/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -24,11 +24,9 @@
 typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
 typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
-#if EIGEN_HAS_CXX11_MATH
 typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
 typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
 typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
-#endif
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
@@ -355,7 +353,6 @@ cosh() const
   return CoshReturnType(derived());
 }
 
-#if EIGEN_HAS_CXX11_MATH
 /** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
  *
  * \sa Math functions, atanh(), asinh(), acosh()
@@ -388,7 +385,6 @@ acosh() const
 {
   return AcoshReturnType(derived());
 }
-#endif
 
 /** \returns an expression of the coefficient-wise logistic of *this.
  */
@@ -694,3 +690,32 @@ ndtri() const
 {
   return NdtriReturnType(derived());
 }
+
+template <typename ScalarExponent>
+using UnaryPowReturnType =
+    std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
+                     CwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>;
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
+    const ScalarExponent& exponent) const {
+  return UnaryPowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
+}
+#else
+/** \returns an expression of the coefficients of \c *this raised to the constant power \a exponent
+ *
+ * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+ *
+ * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+ * unsupported module MatrixFunctions computes the matrix power.
+ *
+ * Example: \include Cwise_pow.cpp
+ * Output: \verbinclude Cwise_pow.out
+ *
+ * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
+ */
+template <typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
+    const ScalarExponent& exponent) const;
+#endif
diff --git a/libs/eigen/Eigen/src/plugins/BlockMethods.h b/libs/eigen/Eigen/src/plugins/BlockMethods.h
index 63a52a6..68b9413 100644
--- a/libs/eigen/Eigen/src/plugins/BlockMethods.h
+++ b/libs/eigen/Eigen/src/plugins/BlockMethods.h
@@ -1418,19 +1418,19 @@ innerVectors(Index outerStart, Index outerSize) const
  */
 template<DirectionType Direction>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type
+std::conditional_t<Direction==Vertical,ColXpr,RowXpr>
 subVector(Index i)
 {
-  return typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type(derived(),i);
+  return std::conditional_t<Direction==Vertical,ColXpr,RowXpr>(derived(),i);
 }
 
 /** This is the const version of subVector(Index) */
 template<DirectionType Direction>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type
+std::conditional_t<Direction==Vertical,ConstColXpr,ConstRowXpr>
 subVector(Index i) const
 {
-  return typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type(derived(),i);
+  return std::conditional_t<Direction==Vertical,ConstColXpr,ConstRowXpr>(derived(),i);
 }
 
 /** \returns the number of subvectors (rows or columns) in the direction \c Direction
diff --git a/libs/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h b/libs/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h
index 8b6730e..2f50329 100644
--- a/libs/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/libs/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -12,7 +12,7 @@
 
 /** \returns an expression of the difference of \c *this and \a other
  *
- * \note If you want to substract a given scalar from all coefficients, see Cwise::operator-().
+ * \note If you want to subtract a given scalar from all coefficients, see Cwise::operator-().
  *
  * \sa class CwiseBinaryOp, operator-=()
  */
diff --git a/libs/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h b/libs/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 5418dc4..390759c 100644
--- a/libs/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/libs/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -13,20 +13,20 @@
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 /** \internal the return type of conjugate() */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                                       const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
-                                       const Derived&
-                                      >::type ConjugateReturnType;
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                           const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                           const Derived&
+                          > ConjugateReturnType;
 /** \internal the return type of real() const */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                                       const CwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>,
-                                       const Derived&
-                                      >::type RealReturnType;
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                           const CwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>,
+                           const Derived&
+                          > RealReturnType;
 /** \internal the return type of real() */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                                       CwiseUnaryView<internal::scalar_real_ref_op<Scalar>, Derived>,
-                                       Derived&
-                                      >::type NonConstRealReturnType;
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                           CwiseUnaryView<internal::scalar_real_ref_op<Scalar>, Derived>,
+                           Derived&
+                          > NonConstRealReturnType;
 /** \internal the return type of imag() const */
 typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturnType;
 /** \internal the return type of imag() */
@@ -83,10 +83,10 @@ EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
 /// \sa conjugate()
 template<bool Cond>
 EIGEN_DEVICE_FUNC
-inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
+inline std::conditional_t<Cond,ConjugateReturnType,const Derived&>
 conjugateIf() const
 {
-  typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
+  typedef std::conditional_t<Cond,ConjugateReturnType,const Derived&> ReturnType;
   return ReturnType(derived());
 }
 
diff --git a/libs/eigen/Eigen/src/plugins/IndexedViewMethods.h
b/libs/eigen/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19a..011fcbe 100644 --- a/libs/eigen/Eigen/src/plugins/IndexedViewMethods.h +++ b/libs/eigen/Eigen/src/plugins/IndexedViewMethods.h @@ -67,9 +67,9 @@ struct EIGEN_INDEXED_VIEW_METHOD_TYPE { // This is the generic version template -typename internal::enable_if::value +std::enable_if_t::value && internal::traits::type>::ReturnAsIndexedView, - typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type + typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type> operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST { return typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type @@ -79,9 +79,9 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND // The following overload returns a Block<> object template -typename internal::enable_if::value +std::enable_if_t::value && internal::traits::type>::ReturnAsBlock, - typename internal::traits::type>::BlockType>::type + typename internal::traits::type>::BlockType> operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST { typedef typename internal::traits::type>::BlockType BlockType; @@ -90,23 +90,21 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar template -typename internal::enable_if::value +std::enable_if_t::value && internal::traits::type>::ReturnAsScalar, - CoeffReturnType >::type + CoeffReturnType > operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST { return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); } -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE - // The following three overloads are needed to handle raw Index[N] arrays. 
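// ---------------------------------------------------------------------------
// [editor's aside] A hedged sketch of what the raw-array overloads that follow
// enable at call sites (Eigen 3.4-style indexed views; the function name is
// illustrative):
#include <Eigen/Dense>
#include <iostream>

inline void raw_array_indexing_demo() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  int rows[] = {0, 2};
  int cols[] = {1, 3};
  // Selects the 2x2 submatrix at the intersections of rows {0,2} and columns {1,3}.
  std::cout << A(rows, cols) << "\n";
}
// ---------------------------------------------------------------------------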
template @@ -133,14 +131,13 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col (derived(), rowIndices, colIndices); } -#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE // Overloads for 1D vectors/arrays template -typename internal::enable_if< +std::enable_if_t< IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), - IndexedView::type> >::type + IndexedView::type> > operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -149,9 +146,9 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST } template -typename internal::enable_if< +std::enable_if_t< (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), - IndexedView::type,IvcIndex> >::type + IndexedView::type,IvcIndex> > operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -160,29 +157,27 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST } template -typename internal::enable_if< +std::enable_if_t< (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), - VectorBlock::value> >::type + VectorBlock::value> > operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template -typename internal::enable_if::value, CoeffReturnType >::type +std::enable_if_t::value, CoeffReturnType > operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST { return Base::operator()(internal::eval_expr_given_size(id,size())); } -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE - template -typename internal::enable_if >::type +std::enable_if_t > operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -191,8 +186,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST } template -typename internal::enable_if >::type +std::enable_if_t > operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -200,8 +195,6 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST (derived(), indices, IvcIndex(0)); } -#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE - #undef EIGEN_INDEXED_VIEW_METHOD_CONST #undef EIGEN_INDEXED_VIEW_METHOD_TYPE @@ -218,7 +211,7 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST * * Each parameter must either be: * - An integer indexing a single row or column - * - Eigen::all indexing the full set of respective rows or columns in increasing order + * - Eigen::placeholders::all indexing the full set of respective rows or columns in increasing order * - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions * - Any %Eigen's vector/array of integers or expressions * - Plain C arrays: \c int[N] @@ -235,7 +228,7 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST * method will returns a Block object after extraction of the relevant information from the passed arguments. 
This is the case * when all arguments are either: * - An integer - * - Eigen::all + * - Eigen::placeholders::all * - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N). * * Otherwise a more general IndexedView object will be returned, after conversion of the inputs diff --git a/libs/eigen/Eigen/src/plugins/InternalHeaderCheck.h b/libs/eigen/Eigen/src/plugins/InternalHeaderCheck.h new file mode 100644 index 0000000..ac6821d --- /dev/null +++ b/libs/eigen/Eigen/src/plugins/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CORE_MODULE_H +#error "Please include Eigen/plugins instead of including headers inside the src directory directly." +#endif diff --git a/libs/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/libs/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h index a0feef8..46fe08c 100644 --- a/libs/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/libs/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -72,23 +72,24 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa class CwiseBinaryOp, max() */ -template +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMin(const Scalar &other) const { - return cwiseMin(Derived::Constant(rows(), cols(), other)); + return cwiseMin(Derived::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of *this and \a other @@ -98,23 +99,24 @@ cwiseMin(const Scalar &other) const * * \sa class CwiseBinaryOp, min() */ -template +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise max of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMax(const Scalar &other) const { - return cwiseMax(Derived::Constant(rows(), cols(), other)); + return cwiseMax(Derived::Constant(rows(), cols(), other)); } diff --git a/libs/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/libs/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h index 0514d8f..98d925d 100644 --- a/libs/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +++ b/libs/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h @@ -93,3 +93,13 @@ EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg) EIGEN_DEVICE_FUNC inline const CwiseArgReturnType cwiseArg() const { return CwiseArgReturnType(derived()); } + +template 
+using CwisePowReturnType = + std::enable_if_t::Real>::value, + CwiseUnaryOp, const Derived>>; + +template +EIGEN_DEVICE_FUNC inline const CwisePowReturnType cwisePow(const ScalarExponent& exponent) const { + return CwisePowReturnType(derived(), internal::scalar_unary_pow_op(exponent)); +} diff --git a/libs/eigen/Eigen/src/plugins/ReshapedMethods.h b/libs/eigen/Eigen/src/plugins/ReshapedMethods.h index 482a6b0..2cb1cf6 100644 --- a/libs/eigen/Eigen/src/plugins/ReshapedMethods.h +++ b/libs/eigen/Eigen/src/plugins/ReshapedMethods.h @@ -105,13 +105,13 @@ EIGEN_DEVICE_FUNC inline Reshaped::value, internal::get_compiletime_reshape_size::value, - internal::get_compiletime_reshape_order::value> + internal::get_compiletime_reshape_order(Flags, Order)> reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST { return Reshaped::value, internal::get_compiletime_reshape_size::value, - internal::get_compiletime_reshape_order::value> + internal::get_compiletime_reshape_order(Flags, Order)> (derived(), internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); @@ -129,12 +129,12 @@ reshaped() EIGEN_RESHAPED_METHOD_CONST template EIGEN_DEVICE_FUNC inline Reshaped::value> + internal::get_compiletime_reshape_order(Flags, Order)> reshaped() EIGEN_RESHAPED_METHOD_CONST { EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); return Reshaped::value> + internal::get_compiletime_reshape_order(Flags, Order)> (derived(), size(), 1); } diff --git a/libs/eigen/bench/btl/generic_bench/timers/portable_timer.hh b/libs/eigen/bench/btl/generic_bench/timers/portable_timer.hh old mode 100755 new mode 100644 diff --git a/libs/eigen/bench/btl/libs/STL/STL_interface.hh b/libs/eigen/bench/btl/libs/STL/STL_interface.hh index 16658c4..5b391c6 100644 --- a/libs/eigen/bench/btl/libs/STL/STL_interface.hh +++ b/libs/eigen/bench/btl/libs/STL/STL_interface.hh @@ -84,9 +84,12 @@ public : for (int j=0;j=j) + { for (int k=0;k cpqr(A.rows(),A.cols()); CompleteOrthogonalDecomposition cod(A.rows(),A.cols()); FullPivHouseholderQR fpqr(A.rows(),A.cols()); - JacobiSVD jsvd(A.rows(),A.cols()); - BDCSVD bdcsvd(A.rows(),A.cols()); + JacobiSVD jsvd(A.rows(),A.cols()); + BDCSVD bdcsvd(A.rows(),A.cols()); BENCH(t_llt, tries, rep, compute_norm_equation(llt,A)); BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A)); @@ -67,9 +65,9 @@ void bench(int id, int rows, int size = Size) if(size*rows<=10000000) BENCH(t_fpqr, tries, rep, compute(fpqr,A)); if(size<500) // JacobiSVD is really too slow for too large matrices - BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt)); + BENCH(t_jsvd, tries, rep, jsvd.compute(A)); // if(size*rows<=20000000) - BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt)); + BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A)); results["LLT"][id] = t_llt.best(); results["LDLT"][id] = t_ldlt.best(); diff --git a/libs/eigen/bench/spbench/CMakeLists.txt b/libs/eigen/bench/spbench/CMakeLists.txt index b186004..75c36b0 100644 --- a/libs/eigen/bench/spbench/CMakeLists.txt +++ b/libs/eigen/bench/spbench/CMakeLists.txt @@ -1,7 +1,7 @@ -set(BLAS_FOUND TRUE) -set(LAPACK_FOUND TRUE) +set(BLAS_FOUND EIGEN_BUILD_BLAS) +set(LAPACK_FOUND EIGEN_BUILD_LAPACK) set(BLAS_LIBRARIES eigen_blas_static) set(LAPACK_LIBRARIES eigen_lapack_static) diff --git a/libs/eigen/bench/tensors/tensor_benchmarks.h b/libs/eigen/bench/tensors/tensor_benchmarks.h index 0825e15..1a7a0fe 100644 --- 
a/libs/eigen/bench/tensors/tensor_benchmarks.h +++ b/libs/eigen/bench/tensors/tensor_benchmarks.h @@ -219,14 +219,8 @@ template class BenchmarkSuite { size_b[1] = m_; TensorMap, Eigen::Aligned> B(b_, size_b); -#if defined(EIGEN_HAS_INDEX_LIST) Eigen::IndexPairList, Eigen::type2indexpair<2, 1> > paddings; -#else - Eigen::array, 2> paddings; - paddings[0] = Eigen::IndexPair(0, 0); - paddings[1] = Eigen::IndexPair(2, 1); -#endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { B.device(device_) = A.pad(paddings); @@ -251,15 +245,7 @@ template class BenchmarkSuite { size_b[1] = k_/2; TensorMap, Eigen::Aligned> B(b_, size_b); -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::array strides; - strides[0] = 1; - strides[1] = 2; -#else - // Take advantage of cxx11 to give the compiler information it can use to - // optimize the code. Eigen::IndexList, Eigen::type2index<2> > strides; -#endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { @@ -284,17 +270,8 @@ template class BenchmarkSuite { size_c[0] = m_; size_c[1] = n_; TensorMap, Eigen::Aligned> C(c_, size_c); - -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::array broadcast; - broadcast[0] = 1; - broadcast[1] = n_; -#else - // Take advantage of cxx11 to give the compiler information it can use to - // optimize the code. Eigen::IndexList, int> broadcast; broadcast.set(1, n_); -#endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { @@ -385,15 +362,7 @@ for (int iter = 0; iter < 10; ++iter) { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::array sum_along_dim; - sum_along_dim[0] = 0; -#else - // Take advantage of cxx11 to give the compiler information it can use to - // optimize the code. Eigen::IndexList> sum_along_dim; -#endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -564,9 +533,9 @@ for (int iter = 0; iter < 10; ++iter) { // Initialize the content of the memory pools to prevent asan from // complaining. - device_.memset(a_, 12, m_ * k_ * sizeof(T)); - device_.memset(b_, 23, k_ * n_ * sizeof(T)); - device_.memset(c_, 31, m_ * n_ * sizeof(T)); + device_.fill(a_, a_ + m_ * k_, T(12)); + device_.fill(b_, b_ + k_ * n_, T(23)); + device_.fill(c_, c_ + m_ * n_, T(31)); } diff --git a/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc b/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc index 8f2defe..c2d098e 100644 --- a/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc +++ b/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc @@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T // Initialize the content of the memory pools to prevent asan from // complaining. 
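// ---------------------------------------------------------------------------
// [editor's aside] Why memset is being replaced by fill in these benchmarks:
// memset writes a repeated byte pattern, whereas fill writes typed values.
// A standalone analogue using only the C++ standard library:
#include <algorithm>
#include <cstring>
#include <vector>

inline void memset_vs_fill_demo() {
  std::vector<float> buf(4);
  // Sets every *byte* to 0x0C, so buf[0] becomes a meaningless small float, not 12.0f.
  std::memset(buf.data(), 12, buf.size() * sizeof(float));
  // Sets every *element* to 12.0f, which is what the benchmarks actually intend.
  std::fill(buf.begin(), buf.end(), 12.0f);
}
// ---------------------------------------------------------------------------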
- device_.memset(a_, 12, m_ * k_ * sizeof(T)); - device_.memset(b_, 23, k_ * n_ * sizeof(T)); - device_.memset(c_, 31, m_ * n_ * sizeof(T)); + device_.fill(a_, m_ * k_, T(12)); + device_.fill(b_, k_ * n_, T(23)); + device_.fill(c_, m_ * n_, T(31)); Eigen::array sizeA; sizeA[0] = m_; diff --git a/libs/eigen/blas/BandTriangularSolver.h b/libs/eigen/blas/BandTriangularSolver.h index ce2d74d..e8551cd 100644 --- a/libs/eigen/blas/BandTriangularSolver.h +++ b/libs/eigen/blas/BandTriangularSolver.h @@ -29,11 +29,9 @@ struct band_solve_triangular_selector(lhsStride)); RhsMap other(_other,size,1); - typename internal::conditional< - ConjLhs, - const CwiseUnaryOp,LhsMap>, - const LhsMap&> - ::type cjLhs(lhs); + std::conditional_t,LhsMap>, + const LhsMap&> cjLhs(lhs); for(int col=0 ; col(lhsStride)); RhsMap other(_other,size,1); - typename internal::conditional< - ConjLhs, - const CwiseUnaryOp,LhsMap>, - const LhsMap&> - ::type cjLhs(lhs); + std::conditional_t,LhsMap>, + const LhsMap&> cjLhs(lhs); for(int col=0 ; col::type ConjLhsType; typedef Map > ResMap; - for (Index i=0; i0)) - ResMap(res+(IsLower ? s+i : 0),r) += alpha * cj(rhs[i]) * ConjLhsType(LhsMap(lhs+s,r)); - if (HasUnitDiag) - res[i] += alpha * cj(rhs[i]); - lhs += IsLower ? size-i: i+1; + for (Index i = 0; i < size; ++i) { + Index s = IsLower && (HasUnitDiag || HasZeroDiag) ? 1 : 0; + Index r = IsLower ? size - i : i + 1; + if (!(HasUnitDiag || HasZeroDiag) || (--r > 0)) { + ResMap(res + (IsLower ? s + i : 0), r) += alpha * cj(rhs[i]) * ConjLhsType(LhsMap(lhs + s, r)); + } + if (HasUnitDiag) { + res[i] += alpha * cj(rhs[i]); + } + lhs += IsLower ? size - i : i + 1; } }; }; @@ -61,15 +62,16 @@ struct packed_triangular_matrix_vector_product > RhsMap; typedef typename conj_expr_if::type ConjRhsType; - for (Index i=0; i0)) - res[i] += alpha * (ConjLhsType(LhsMap(lhs+s,r)).cwiseProduct(ConjRhsType(RhsMap(rhs+(IsLower ? 0 : s+i),r)))).sum(); - if (HasUnitDiag) - res[i] += alpha * cj(rhs[i]); - lhs += IsLower ? i+1 : size-i; + for (Index i = 0; i < size; ++i) { + Index s = !IsLower && (HasUnitDiag || HasZeroDiag) ? 1 : 0; + Index r = IsLower ? i + 1 : size - i; + if (!(HasUnitDiag || HasZeroDiag) || (--r > 0)) { + res[i] += alpha * (ConjLhsType(LhsMap(lhs + s, r)).cwiseProduct(ConjRhsType(RhsMap(rhs + (IsLower ? 0 : s + i), r)))).sum(); + } + if (HasUnitDiag) { + res[i] += alpha * cj(rhs[i]); + } + lhs += IsLower ? 
i + 1 : size - i; } }; }; diff --git a/libs/eigen/blas/common.h b/libs/eigen/blas/common.h index a9b6978..a938cb1 100644 --- a/libs/eigen/blas/common.h +++ b/libs/eigen/blas/common.h @@ -149,7 +149,7 @@ T* get_compact_vector(T* x, int n, int incx) if(incx==1) return x; - typename Eigen::internal::remove_const::type* ret = new Scalar[n]; + std::remove_const_t* ret = new Scalar[n]; if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse(); else make_vector(ret,n) = make_vector(x,n, incx); return ret; diff --git a/libs/eigen/blas/level1_cplx_impl.h b/libs/eigen/blas/level1_cplx_impl.h index 6c7edd7..aa46784 100644 --- a/libs/eigen/blas/level1_cplx_impl.h +++ b/libs/eigen/blas/level1_cplx_impl.h @@ -11,7 +11,6 @@ struct scalar_norm1_op { typedef RealScalar result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_norm1_op) inline RealScalar operator() (const Scalar& a) const { return numext::norm1(a); } }; namespace Eigen { diff --git a/libs/eigen/blas/level3_impl.h b/libs/eigen/blas/level3_impl.h index 6dd6338..66216c9 100644 --- a/libs/eigen/blas/level3_impl.h +++ b/libs/eigen/blas/level3_impl.h @@ -362,18 +362,18 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking&); static const functype func[8] = { // array index: NOTR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0, // array index: NOTR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0 }; #endif diff --git a/libs/eigen/ci/README.md b/libs/eigen/ci/README.md index 8395b16..6a63eef 100644 --- a/libs/eigen/ci/README.md +++ b/libs/eigen/ci/README.md @@ -9,48 +9,30 @@ The build stage consists of the following jobs: | Job Name | Arch | OS | Compiler | C++11 | |------------------------------------------|-----------|----------------|------------|---------| -| `build:x86-64:linux:gcc-4.8:cxx11-off` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `Off` | | `build:x86-64:linux:gcc-4.8:cxx11-on` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `On` | -| `build:x86-64:linux:gcc-9:cxx11-off` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `Off` | | `build:x86-64:linux:gcc-9:cxx11-on` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `On` | -| `build:x86-64:linux:gcc-10:cxx11-off` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `Off` | | `build:x86-64:linux:gcc-10:cxx11-on` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `On` | -| `build:x86-64:linux:clang-10:cxx11-off` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `Off` | | `build:x86-64:linux:clang-10:cxx11-on` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `On` | -| `build:aarch64:linux:gcc-10:cxx11-off` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `Off` | | 
`build:aarch64:linux:gcc-10:cxx11-on` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `On` | -| `build:aarch64:linux:clang-10:cxx11-off` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off` | | `build:aarch64:linux:clang-10:cxx11-on` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On` | ### Test stage In principle every build-job has a corresponding test-job, however testing supported and unsupported modules is divided into separate jobs. The test jobs in detail: -### Job dependecies +### Job dependencies | Job Name | Arch | OS | Compiler | C++11 | Module |-----------------------------------------------------|-----------|----------------|------------|---------|-------- -| `test:x86-64:linux:gcc-4.8:cxx11-off:official` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `Off` | `Official` -| `test:x86-64:linux:gcc-4.8:cxx11-off:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `Off` | `Unsupported` | `test:x86-64:linux:gcc-4.8:cxx11-on:official` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `On` | `Official` | `test:x86-64:linux:gcc-4.8:cxx11-on:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-4.8` | `On` | `Unsupported` -| `test:x86-64:linux:gcc-9:cxx11-off:official` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `Off` | `Official` -| `test:x86-64:linux:gcc-9:cxx11-off:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `Off` | `Unsupported` | `test:x86-64:linux:gcc-9:cxx11-on:official` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `On` | `Official` | `test:x86-64:linux:gcc-9:cxx11-on:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-9` | `On` | `Unsupported` -| `test:x86-64:linux:gcc-10:cxx11-off:official` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `Off` | `Official` -| `test:x86-64:linux:gcc-10:cxx11-off:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `Off` | `Unsupported` | `test:x86-64:linux:gcc-10:cxx11-on:official` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `On` | `Official` | `test:x86-64:linux:gcc-10:cxx11-on:unsupported` | `x86-64` | `Ubuntu 18.04` | `GCC-10` | `On` | `Unsupported` -| `test:x86-64:linux:clang-10:cxx11-off:official` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `Off` | `Official` -| `test:x86-64:linux:clang-10:cxx11-off:unsupported` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `Off` | `Unsupported` | `test:x86-64:linux:clang-10:cxx11-on:official` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `On` | `Official` | `test:x86-64:linux:clang-10:cxx11-on:unsupported` | `x86-64` | `Ubuntu 18.04` | `Clang-10` | `On` | `Unsupported` -| `test:aarch64:linux:gcc-10:cxx11-off:official` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `Off` | `Official` -| `test:aarch64:linux:gcc-10:cxx11-off:unsupported` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `Off` | `Unsupported` | `test:aarch64:linux:gcc-10:cxx11-on:official` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `On` | `Official` | `test:aarch64:linux:gcc-10:cxx11-on:unsupported` | `AArch64` | `Ubuntu 18.04` | `GCC-10` | `On` | `Unsupported` -| `test:aarch64:linux:clang-10:cxx11-off:official` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off` | `Official` -| `test:aarch64:linux:clang-10:cxx11-off:unsupported` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off` | `Unsupported` | `test:aarch64:linux:clang-10:cxx11-on:official` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On` | `Official` | `test:aarch64:linux:clang-10:cxx11-on:unsupported` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On` | `Unsupported` diff --git a/libs/eigen/ci/build.gitlab-ci.yml b/libs/eigen/ci/build.gitlab-ci.yml index 6b9f415..073212a 100644 --- a/libs/eigen/ci/build.gitlab-ci.yml +++ 
b/libs/eigen/ci/build.gitlab-ci.yml @@ -23,41 +23,19 @@ - schedules ######## x86-64 ################################################################ -# GCC-4.8 (the oldest compiler we support) -build:x86-64:linux:gcc-4.8:cxx11-off: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-4.8" - EIGEN_CI_CC_COMPILER: "gcc-4.8" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - x86-64 - -build:x86-64:linux:gcc-4.8:cxx11-on: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-4.8" - EIGEN_CI_CC_COMPILER: "gcc-4.8" - EIGEN_TEST_CXX11: "on" - tags: - - eigen-runner - - linux - - x86-64 +# # GCC-4.8 (the oldest compiler we support) +# build:x86-64:linux:gcc-4.8:cxx11-on: +# extends: .build:linux:base +# variables: +# EIGEN_CI_CXX_COMPILER: "g++-4.8" +# EIGEN_CI_CC_COMPILER: "gcc-4.8" +# EIGEN_TEST_CXX11: "on" +# tags: +# - eigen-runner +# - linux +# - x86-64 # GCC-9 -build:x86-64:linux:gcc-9:cxx11-off: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-9" - EIGEN_CI_CC_COMPILER: "gcc-9" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - x86-64 - build:x86-64:linux:gcc-9:cxx11-on: extends: .build:linux:base variables: @@ -70,17 +48,6 @@ build:x86-64:linux:gcc-9:cxx11-on: - x86-64 # GCC-10 -build:x86-64:linux:gcc-10:cxx11-off: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-10" - EIGEN_CI_CC_COMPILER: "gcc-10" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - x86-64 - build:x86-64:linux:gcc-10:cxx11-on: extends: .build:linux:base variables: @@ -93,17 +60,6 @@ build:x86-64:linux:gcc-10:cxx11-on: - x86-64 # Clang-10 -build:x86-64:linux:clang-10:cxx11-off: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "clang++-10" - EIGEN_CI_CC_COMPILER: "clang-10" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - x86-64 - build:x86-64:linux:clang-10:cxx11-on: extends: .build:linux:base variables: @@ -115,19 +71,22 @@ build:x86-64:linux:clang-10:cxx11-on: - linux - x86-64 -######## AArch64 ############################################################### -# GCC-10 -build:aarch64:linux:gcc-10:cxx11-off: +# Clang-10, AVX512 +build:x86-64:linux:clang-10:cxx11-on:avx512: extends: .build:linux:base variables: - EIGEN_CI_CXX_COMPILER: "g++-10" - EIGEN_CI_CC_COMPILER: "gcc-10" - EIGEN_TEST_CXX11: "off" + EIGEN_CI_CXX_COMPILER: "clang++-10" + EIGEN_CI_CC_COMPILER: "clang-10" + EIGEN_TEST_CXX11: "on" + EIGEN_TEST_AVX512DQ: "on" tags: - eigen-runner - linux - - aarch64 + - x86-64 + - avx512 +######## AArch64 ############################################################### +# GCC-10 build:aarch64:linux:gcc-10:cxx11-on: extends: .build:linux:base variables: @@ -139,18 +98,7 @@ build:aarch64:linux:gcc-10:cxx11-on: - linux - aarch64 -# Clang-10 -build:aarch64:linux:clang-10:cxx11-off: - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "clang++-10" - EIGEN_CI_CC_COMPILER: "clang-10" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - aarch64 - +# # Clang-10 build:aarch64:linux:clang-10:cxx11-on: extends: .build:linux:base variables: @@ -166,18 +114,6 @@ build:aarch64:linux:clang-10:cxx11-on: # Currently all ppc64le jobs are allowed to fail # GCC-10 -build:ppc64le:linux:gcc-10:cxx11-off: - allow_failure: true - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-10" - EIGEN_CI_CC_COMPILER: "gcc-10" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - ppc64le - build:ppc64le:linux:gcc-10:cxx11-on: allow_failure: true 
extends: .build:linux:base @@ -185,24 +121,13 @@ build:ppc64le:linux:gcc-10:cxx11-on: EIGEN_CI_CXX_COMPILER: "g++-10" EIGEN_CI_CC_COMPILER: "gcc-10" EIGEN_TEST_CXX11: "on" + EIGEN_CI_ADDITIONAL_ARGS: "-DCMAKE_CXX_FLAGS='-DEIGEN_ALTIVEC_DISABLE_MMA'" tags: - eigen-runner - linux - ppc64le -# # Clang-10 -build:ppc64le:linux:clang-10:cxx11-off: - allow_failure: true - extends: .build:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "clang++-10" - EIGEN_CI_CC_COMPILER: "clang-10" - EIGEN_TEST_CXX11: "off" - tags: - - eigen-runner - - linux - - ppc64le - +# Clang-10 build:ppc64le:linux:clang-10:cxx11-on: allow_failure: true extends: .build:linux:base diff --git a/libs/eigen/ci/smoketests.gitlab-ci.yml b/libs/eigen/ci/smoketests.gitlab-ci.yml index 6384f10..c69d392 100644 --- a/libs/eigen/ci/smoketests.gitlab-ci.yml +++ b/libs/eigen/ci/smoketests.gitlab-ci.yml @@ -22,13 +22,6 @@ only: - merge_requests -buildsmoketests:x86-64:linux:gcc-10:cxx11-off: - extends: .buildsmoketests:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "g++-10" - EIGEN_CI_CC_COMPILER: "gcc-10" - EIGEN_TEST_CXX11: "off" - buildsmoketests:x86-64:linux:gcc-10:cxx11-on: extends: .buildsmoketests:linux:base variables: @@ -36,13 +29,6 @@ buildsmoketests:x86-64:linux:gcc-10:cxx11-on: EIGEN_CI_CC_COMPILER: "gcc-10" EIGEN_TEST_CXX11: "on" -buildsmoketests:x86-64:linux:clang-10:cxx11-off: - extends: .buildsmoketests:linux:base - variables: - EIGEN_CI_CXX_COMPILER: "clang++-10" - EIGEN_CI_CC_COMPILER: "clang-10" - EIGEN_TEST_CXX11: "off" - buildsmoketests:x86-64:linux:clang-10:cxx11-on: extends: .buildsmoketests:linux:base variables: @@ -61,9 +47,11 @@ buildsmoketests:x86-64:linux:clang-10:cxx11-on: - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER} ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc script: + - export NPROC=`nproc` + - echo ${NPROC} - export CXX=${EIGEN_CI_CXX_COMPILER} - export CC=${EIGEN_CI_CC_COMPILER} - - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output + - cd ${BUILDDIR} && ctest -j${NPROC} --output-on-failure --no-compress-output --build-no-clean -T test -L smoketest after_script: - apt-get update -y @@ -78,13 +66,6 @@ buildsmoketests:x86-64:linux:clang-10:cxx11-on: only: - merge_requests -smoketests:x86-64:linux:gcc-10:cxx11-off: - extends: .smoketests:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-10 - EIGEN_CI_CC_COMPILER: gcc-10 - needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-off" ] - smoketests:x86-64:linux:gcc-10:cxx11-on: extends: .smoketests:linux:base variables: @@ -92,13 +73,6 @@ smoketests:x86-64:linux:gcc-10:cxx11-on: EIGEN_CI_CC_COMPILER: gcc-10 needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-on" ] -smoketests:x86-64:linux:clang-10:cxx11-off: - extends: .smoketests:linux:base - variables: - EIGEN_CI_CXX_COMPILER: clang++-10 - EIGEN_CI_CC_COMPILER: clang-10 - needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-off" ] - smoketests:x86-64:linux:clang-10:cxx11-on: extends: .smoketests:linux:base variables: diff --git a/libs/eigen/ci/test.gitlab-ci.yml b/libs/eigen/ci/test.gitlab-ci.yml index 2a0f5dd..63012bd 100644 --- a/libs/eigen/ci/test.gitlab-ci.yml +++ b/libs/eigen/ci/test.gitlab-ci.yml @@ -10,9 +10,11 @@ - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER} ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc script: + - export NPROC=`nproc` + - echo ${NPROC} - export CXX=${EIGEN_CI_CXX_COMPILER} - export CC=${EIGEN_CI_CC_COMPILER} - - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output + - cd ${BUILDDIR} && ctest 
-j${NPROC} --output-on-failure --no-compress-output --build-no-clean -T test -L ${EIGEN_CI_TEST_LABEL} after_script: - apt-get update -y @@ -28,78 +30,36 @@ - schedules ##### x86-64 ################################################################### -# GCC-4.8 -.test:x86-64:linux:gcc-4.8:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-4.8 - EIGEN_CI_CC_COMPILER: gcc-4.8 - needs: [ "build:x86-64:linux:gcc-4.8:cxx11-off" ] - tags: - - eigen-runner - - linux - - x86-64 +# # GCC-4.8 +# .test:x86-64:linux:gcc-4.8:cxx11-on: +# extends: .test:linux:base +# variables: +# EIGEN_CI_CXX_COMPILER: g++-4.8 +# EIGEN_CI_CC_COMPILER: gcc-4.8 +# needs: [ "build:x86-64:linux:gcc-4.8:cxx11-on" ] +# tags: +# - eigen-runner +# - linux +# - x86-64 -test:x86-64:linux:gcc-4.8:cxx11-off:official: - extends: .test:x86-64:linux:gcc-4.8:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Official" +# test:x86-64:linux:gcc-4.8:cxx11-on:official: +# extends: .test:x86-64:linux:gcc-4.8:cxx11-on +# variables: +# EIGEN_CI_TEST_LABEL: "Official" -test:x86-64:linux:gcc-4.8:cxx11-off:unsupported: - extends: .test:x86-64:linux:gcc-4.8:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - -.test:x86-64:linux:gcc-4.8:cxx11-on: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-4.8 - EIGEN_CI_CC_COMPILER: gcc-4.8 - needs: [ "build:x86-64:linux:gcc-4.8:cxx11-on" ] - tags: - - eigen-runner - - linux - - x86-64 - -test:x86-64:linux:gcc-4.8:cxx11-on:official: - extends: .test:x86-64:linux:gcc-4.8:cxx11-on - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:x86-64:linux:gcc-4.8:cxx11-on:unsupported: - extends: .test:x86-64:linux:gcc-4.8:cxx11-on - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" +# test:x86-64:linux:gcc-4.8:cxx11-on:unsupported: +# extends: .test:x86-64:linux:gcc-4.8:cxx11-on +# variables: +# EIGEN_CI_TEST_LABEL: "Unsupported" # GCC-9 -.test:x86-64:linux:gcc-9:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-9 - EIGEN_CI_CC_COMPILER: gcc-9 - needs: [ "build:x86-64:linux:gcc-9:cxx11-off" ] - tags: - - eigen-runner - - linux - - x86-64 - -test:x86-64:linux:gcc-9:cxx11-off:official: - extends: .test:x86-64:linux:gcc-9:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:x86-64:linux:gcc-9:cxx11-off:unsupported: - extends: .test:x86-64:linux:gcc-9:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - .test:x86-64:linux:gcc-9:cxx11-on: extends: .test:linux:base variables: EIGEN_CI_CXX_COMPILER: g++-9 EIGEN_CI_CC_COMPILER: gcc-9 needs: [ "build:x86-64:linux:gcc-9:cxx11-on" ] - tags: + tags: - eigen-runner - linux - x86-64 @@ -115,35 +75,13 @@ test:x86-64:linux:gcc-9:cxx11-on:unsupported: EIGEN_CI_TEST_LABEL: "Unsupported" # GCC-10 -.test:x86-64:linux:gcc-10:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-10 - EIGEN_CI_CC_COMPILER: gcc-10 - needs: [ "build:x86-64:linux:gcc-10:cxx11-off" ] - tags: - - eigen-runner - - linux - - x86-64 - -test:x86-64:linux:gcc-10:cxx11-off:official: - extends: .test:x86-64:linux:gcc-10:cxx11-off - allow_failure: true - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:x86-64:linux:gcc-10:cxx11-off:unsupported: - extends: .test:x86-64:linux:gcc-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - .test:x86-64:linux:gcc-10:cxx11-on: extends: .test:linux:base variables: EIGEN_CI_CXX_COMPILER: g++-10 EIGEN_CI_CC_COMPILER: gcc-10 needs: [ "build:x86-64:linux:gcc-10:cxx11-on" ] - tags: + tags: - eigen-runner - linux - x86-64 @@ -161,34 +99,13 
@@ test:x86-64:linux:gcc-10:cxx11-on:unsupported: EIGEN_CI_TEST_LABEL: "Unsupported" # Clang 10 -.test:x86-64:linux:clang-10:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: clang++-10 - EIGEN_CI_CC_COMPILER: clang-10 - needs: [ "build:x86-64:linux:clang-10:cxx11-off" ] - tags: - - eigen-runner - - linux - - x86-64 - -test:x86-64:linux:clang-10:cxx11-off:official: - extends: .test:x86-64:linux:clang-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:x86-64:linux:clang-10:cxx11-off:unsupported: - extends: .test:x86-64:linux:clang-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - .test:x86-64:linux:clang-10:cxx11-on: extends: .test:linux:base variables: EIGEN_CI_CXX_COMPILER: clang++-10 EIGEN_CI_CC_COMPILER: clang-10 needs: [ "build:x86-64:linux:clang-10:cxx11-on" ] - tags: + tags: - eigen-runner - linux - x86-64 @@ -203,38 +120,37 @@ test:x86-64:linux:clang-10:cxx11-on:unsupported: variables: EIGEN_CI_TEST_LABEL: "Unsupported" -##### AArch64 ################################################################## -# GCC-10 -.test:aarch64:linux:gcc-10:cxx11-off: +.test:x86-64:linux:clang-10:cxx11-on:avx512: extends: .test:linux:base variables: - EIGEN_CI_CXX_COMPILER: g++-10 - EIGEN_CI_CC_COMPILER: gcc-10 - needs: [ "build:aarch64:linux:gcc-10:cxx11-off" ] - tags: + EIGEN_CI_CXX_COMPILER: clang++-10 + EIGEN_CI_CC_COMPILER: clang-10 + needs: [ "build:x86-64:linux:clang-10:cxx11-on:avx512" ] + tags: - eigen-runner - linux - - aarch64 + - x86-64 + - avx512 -test:aarch64:linux:gcc-10:cxx11-off:official: - extends: .test:aarch64:linux:gcc-10:cxx11-off - allow_failure: true +test:x86-64:linux:clang-10:cxx11-on:avx512:official: + extends: .test:x86-64:linux:clang-10:cxx11-on:avx512 variables: EIGEN_CI_TEST_LABEL: "Official" -test:aarch64:linux:gcc-10:cxx11-off:unsupported: - extends: .test:aarch64:linux:gcc-10:cxx11-off - allow_failure: true +test:x86-64:linux:clang-10:cxx11-on:avx512:unsupported: + extends: .test:x86-64:linux:clang-10:cxx11-on:avx512 variables: EIGEN_CI_TEST_LABEL: "Unsupported" +##### AArch64 ################################################################## +# GCC-10 .test:aarch64:linux:gcc-10:cxx11-on: extends: .test:linux:base variables: EIGEN_CI_CXX_COMPILER: g++-10 EIGEN_CI_CC_COMPILER: gcc-10 needs: [ "build:aarch64:linux:gcc-10:cxx11-on" ] - tags: + tags: - eigen-runner - linux - aarch64 @@ -252,35 +168,13 @@ test:aarch64:linux:gcc-10:cxx11-on:unsupported: EIGEN_CI_TEST_LABEL: "Unsupported" # Clang 10 -.test:aarch64:linux:clang-10:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: clang++-10 - EIGEN_CI_CC_COMPILER: clang-10 - needs: [ "build:aarch64:linux:clang-10:cxx11-off" ] - tags: - - eigen-runner - - linux - - aarch64 - -test:aarch64:linux:clang-10:cxx11-off:official: - extends: .test:aarch64:linux:clang-10:cxx11-off - allow_failure: true - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:aarch64:linux:clang-10:cxx11-off:unsupported: - extends: .test:aarch64:linux:clang-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - .test:aarch64:linux:clang-10:cxx11-on: extends: .test:linux:base variables: EIGEN_CI_CXX_COMPILER: clang++-10 EIGEN_CI_CC_COMPILER: clang-10 needs: [ "build:aarch64:linux:clang-10:cxx11-on" ] - tags: + tags: - eigen-runner - linux - aarch64 @@ -298,28 +192,6 @@ test:aarch64:linux:clang-10:cxx11-on:unsupported: ##### ppc64le ################################################################## # GCC-10 -.test:ppc64le:linux:gcc-10:cxx11-off: - extends: 
.test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: g++-10 - EIGEN_CI_CC_COMPILER: gcc-10 - needs: [ "build:ppc64le:linux:gcc-10:cxx11-off" ] - allow_failure: true - tags: - - eigen-runner - - linux - - ppc64le - -test:ppc64le:linux:gcc-10:cxx11-off:official: - extends: .test:ppc64le:linux:gcc-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:ppc64le:linux:gcc-10:cxx11-off:unsupported: - extends: .test:ppc64le:linux:gcc-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - .test:ppc64le:linux:gcc-10:cxx11-on: extends: .test:linux:base variables: @@ -327,7 +199,7 @@ test:ppc64le:linux:gcc-10:cxx11-off:unsupported: EIGEN_CI_CC_COMPILER: gcc-10 needs: [ "build:ppc64le:linux:gcc-10:cxx11-on" ] allow_failure: true - tags: + tags: - eigen-runner - linux - ppc64le @@ -342,29 +214,7 @@ test:ppc64le:linux:gcc-10:cxx11-on:unsupported: variables: EIGEN_CI_TEST_LABEL: "Unsupported" -# # Clang 10 -.test:ppc64le:linux:clang-10:cxx11-off: - extends: .test:linux:base - variables: - EIGEN_CI_CXX_COMPILER: clang++-10 - EIGEN_CI_CC_COMPILER: clang-10 - needs: [ "build:ppc64le:linux:clang-10:cxx11-off" ] - allow_failure: true - tags: - - eigen-runner - - linux - - ppc64le - -test:ppc64le:linux:clang-10:cxx11-off:official: - extends: .test:ppc64le:linux:clang-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Official" - -test:ppc64le:linux:clang-10:cxx11-off:unsupported: - extends: .test:ppc64le:linux:clang-10:cxx11-off - variables: - EIGEN_CI_TEST_LABEL: "Unsupported" - +# Clang 10 .test:ppc64le:linux:clang-10:cxx11-on: extends: .test:linux:base variables: @@ -372,7 +222,7 @@ test:ppc64le:linux:clang-10:cxx11-off:unsupported: EIGEN_CI_CC_COMPILER: clang-10 needs: [ "build:ppc64le:linux:clang-10:cxx11-on" ] allow_failure: true - tags: + tags: - eigen-runner - linux - ppc64le diff --git a/libs/eigen/cmake/Eigen3Config.cmake.in b/libs/eigen/cmake/Eigen3Config.cmake.in index 0a1ac61..96582f5 100644 --- a/libs/eigen/cmake/Eigen3Config.cmake.in +++ b/libs/eigen/cmake/Eigen3Config.cmake.in @@ -3,21 +3,6 @@ @PACKAGE_INIT@ -if (NOT TARGET eigen) +if (NOT TARGET Eigen3::Eigen) include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake") -endif () - -# Legacy variables, do *not* use. May be removed in the future. 
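With the config file now guarding on the imported target Eigen3::Eigen instead of the old lowercase eigen target, and the legacy EIGEN3_* variables stripped out just below, downstream projects consume Eigen exclusively through the target. A minimal consumer sketch (the project and source names are placeholders, not part of this patch):

    # CMakeLists.txt of a hypothetical downstream project
    find_package(Eigen3 REQUIRED NO_MODULE)               # config-mode lookup of Eigen3Config.cmake
    add_executable(example main.cpp)
    target_link_libraries(example PRIVATE Eigen3::Eigen)  # include paths propagate; no EIGEN3_* variables needed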
- -set (EIGEN3_FOUND 1) -set (EIGEN3_USE_FILE "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake") - -set (EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@") -set (EIGEN3_INCLUDE_DIR "@PACKAGE_EIGEN_INCLUDE_DIR@") -set (EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@") -set (EIGEN3_ROOT_DIR "@PACKAGE_EIGEN_ROOT_DIR@") - -set (EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@") -set (EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@") -set (EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@") -set (EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@") +endif (NOT TARGET Eigen3::Eigen) diff --git a/libs/eigen/cmake/Eigen3ConfigLegacy.cmake.in b/libs/eigen/cmake/Eigen3ConfigLegacy.cmake.in deleted file mode 100644 index 62d7224..0000000 --- a/libs/eigen/cmake/Eigen3ConfigLegacy.cmake.in +++ /dev/null @@ -1,30 +0,0 @@ -# -*- cmake -*- -# -# Eigen3Config.cmake(.in) - -# Use the following variables to compile and link against Eigen: -# EIGEN3_FOUND - True if Eigen was found on your system -# EIGEN3_USE_FILE - The file making Eigen usable -# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen -# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found -# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and it's dependencies -# EIGEN3_ROOT_DIR - The base directory of Eigen -# EIGEN3_VERSION_STRING - A human-readable string containing the version -# EIGEN3_VERSION_MAJOR - The major version of Eigen -# EIGEN3_VERSION_MINOR - The minor version of Eigen -# EIGEN3_VERSION_PATCH - The patch version of Eigen - -@PACKAGE_INIT@ - -set ( EIGEN3_FOUND 1 ) -set ( EIGEN3_USE_FILE "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" ) - -set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" ) -set ( EIGEN3_INCLUDE_DIR "@PACKAGE_EIGEN_INCLUDE_DIR@" ) -set ( EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@" ) -set ( EIGEN3_ROOT_DIR "@PACKAGE_EIGEN_ROOT_DIR@" ) - -set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" ) -set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) -set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) -set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) diff --git a/libs/eigen/cmake/EigenConfigureTesting.cmake b/libs/eigen/cmake/EigenConfigureTesting.cmake index 9cb3bb2..2a1e7ab 100644 --- a/libs/eigen/cmake/EigenConfigureTesting.cmake +++ b/libs/eigen/cmake/EigenConfigureTesting.cmake @@ -8,9 +8,18 @@ ei_set_sitename() ei_set_build_string() add_custom_target(buildtests) -add_custom_target(check COMMAND "ctest") +add_custom_target(check COMMAND "ctest" ${EIGEN_CTEST_ARGS}) add_dependencies(check buildtests) +# Convenience target for only building GPU tests. +add_custom_target(buildtests_gpu) +add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" + "--no-compress-output" + "--build-no-clean" + "-T" "test" + "-L" "gpu") +add_dependencies(check_gpu buildtests_gpu) + # check whether /bin/bash exists (disabled as not used anymore) # find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) diff --git a/libs/eigen/cmake/EigenDetermineOSVersion.cmake b/libs/eigen/cmake/EigenDetermineOSVersion.cmake deleted file mode 100644 index 9246fa6..0000000 --- a/libs/eigen/cmake/EigenDetermineOSVersion.cmake +++ /dev/null @@ -1,46 +0,0 @@ -# The utility function DetermineOSVersion aims at providing an -# improved version of the CMake variable ${CMAKE_SYSTEM} on Windows -# machines. 
-# -# Usage: -# include(EigenDetermineOSVersion) -# DetermineOSVersion(OS_VERSION) -# message("OS: ${OS_VERSION}") - -# - A little helper variable which should not be directly called -function(DetermineShortWindowsName WIN_VERSION win_num_version) - if (${win_num_version} VERSION_EQUAL "6.1") - set(_version "win7") - elseif(${win_num_version} VERSION_EQUAL "6.0") - set(_version "winVista") - elseif(${win_num_version} VERSION_EQUAL "5.2") - set(_version "winXpProf") - elseif(${win_num_version} VERSION_EQUAL "5.1") - set(_version "winXp") - elseif(${win_num_version} VERSION_EQUAL "5.0") - set(_version "win2000Prof") - else() - set(_version "unknownWin") - endif() - set(${WIN_VERSION} ${_version} PARENT_SCOPE) -endfunction() - -function(DetermineOSVersion OS_VERSION) - if (WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) - file (TO_NATIVE_PATH "$ENV{COMSPEC}" SHELL) - exec_program( ${SHELL} ARGS "/c" "ver" OUTPUT_VARIABLE ver_output) - - string(REGEX MATCHALL "[0-9]+" - ver_list "${ver_output}") - list(GET ver_list 0 _major) - list(GET ver_list 1 _minor) - - set(win_num_version ${_major}.${_minor}) - DetermineShortWindowsName(win_version "${win_num_version}") - if(win_version) - set(${OS_VERSION} ${win_version} PARENT_SCOPE) - endif() - else() - set(${OS_VERSION} ${CMAKE_SYSTEM} PARENT_SCOPE) - endif() -endfunction() diff --git a/libs/eigen/cmake/EigenDetermineVSServicePack.cmake b/libs/eigen/cmake/EigenDetermineVSServicePack.cmake deleted file mode 100644 index fed7819..0000000 --- a/libs/eigen/cmake/EigenDetermineVSServicePack.cmake +++ /dev/null @@ -1,41 +0,0 @@ -include(CMakeDetermineVSServicePack) - -# The code is almost identical to the CMake version. The only difference is that we remove -# _DetermineVSServicePack_FastCheckVersionWithCompiler which lead to errors on some systems. 
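The service-pack probe deleted below is superseded by CMake's built-in version reporting: the ei_get_compilerver hunk later in this diff simply returns CMAKE_CXX_COMPILER_VERSION on MSVC. A sketch of the reduced branch, with the non-MSVC cases omitted:

    # inside ei_get_compilerver(VAR), after this patch
    if(MSVC)
      set(${VAR} "${CMAKE_CXX_COMPILER_VERSION}")   # e.g. a 19.x toolset version string
    endif()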
-function(EigenDetermineVSServicePack _pack) - if(NOT DETERMINED_VS_SERVICE_PACK OR NOT ${_pack}) - if(NOT DETERMINED_VS_SERVICE_PACK) - _DetermineVSServicePack_CheckVersionWithTryCompile(DETERMINED_VS_SERVICE_PACK _cl_version) - if(NOT DETERMINED_VS_SERVICE_PACK) - _DetermineVSServicePack_CheckVersionWithTryRun(DETERMINED_VS_SERVICE_PACK _cl_version) - endif() - endif() - - if(DETERMINED_VS_SERVICE_PACK) - if(_cl_version) - # Call helper function to determine VS version - _DetermineVSServicePackFromCompiler(_sp "${_cl_version}") - - # temporary fix, until CMake catches up - if (NOT _sp) - if(${_cl_version} VERSION_EQUAL "17.00.50727.1") - set(_sp "vc110") - elseif(${_cl_version} VERSION_EQUAL "17.00.51106.1") - set(_sp "vc110sp1") - elseif(${_cl_version} VERSION_EQUAL "17.00.60315.1") - set(_sp "vc110sp2") - elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1") - set(_sp "vc110sp3") - else() - set(_sp ${CMAKE_CXX_COMPILER_VERSION}) - endif() - endif() - - if(_sp) - set(${_pack} ${_sp} CACHE INTERNAL - "The Visual Studio Release with Service Pack") - endif() - endif() - endif() - endif() -endfunction() diff --git a/libs/eigen/cmake/EigenSmokeTestList.cmake b/libs/eigen/cmake/EigenSmokeTestList.cmake index 6f0f724..db7d3ff 100644 --- a/libs/eigen/cmake/EigenSmokeTestList.cmake +++ b/libs/eigen/cmake/EigenSmokeTestList.cmake @@ -61,6 +61,9 @@ set(ei_smoke_test_list mapped_matrix_1 mapstaticmethods_1 mapstride_1 + unaryviewstride_1 + unaryviewstride_2 + unaryviewstride_3 matrix_square_root_1 meta minres_2 @@ -100,6 +103,7 @@ set(ei_smoke_test_list sizeof sizeoverflow smallvectors + sparse_basic_1 sparse_basic_3 sparse_block_1 sparse_extra_4 @@ -128,4 +132,5 @@ set(ei_smoke_test_list unalignedassert unalignedcount vectorwiseop_1 - visitor_1) \ No newline at end of file + visitor_1 + vectorization_logic_1) diff --git a/libs/eigen/cmake/EigenTesting.cmake b/libs/eigen/cmake/EigenTesting.cmake index eb8457d..1ddaa12 100644 --- a/libs/eigen/cmake/EigenTesting.cmake +++ b/libs/eigen/cmake/EigenTesting.cmake @@ -23,10 +23,14 @@ macro(ei_add_test_internal testname testname_with_suffix) set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n") set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + set(is_gpu_test OFF) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) + set(is_gpu_test ON) if(EIGEN_TEST_HIP) hip_reset_flags() - hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14) + target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP) + set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx1030) elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) @@ -36,54 +40,46 @@ macro(ei_add_test_internal testname testname_with_suffix) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib") endif() - if (${ARGC} GREATER 2) - add_executable(${targetname} ${filename}) - else() - add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) - endif() + add_executable(${targetname} ${filename}) set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread") if (CMAKE_SYSTEM_NAME STREQUAL "Linux") set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt") endif() target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES}) else() - if (${ARGC} GREATER 2) - cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) - else() - cuda_add_executable(${targetname} ${filename}) - endif() + 
cuda_add_executable(${targetname} ${filename}) endif() else() add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - add_dependencies(buildtests ${targetname}) + add_dependencies(buildtests ${targetname}) + + if (is_gpu_test) + add_dependencies(buildtests_gpu ${targetname}) endif() if(EIGEN_NO_ASSERTION_CHECKING) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1") + target_compile_definitions(${targetname} PRIVATE EIGEN_NO_ASSERTION_CHECKING=1) else() if(EIGEN_DEBUG_ASSERTS) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1") + target_compile_definitions(${targetname} PRIVATE EIGEN_DEBUG_ASSERTS=1) endif() endif() - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") + target_compile_definitions(${targetname} PRIVATE EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}) if(MSVC) - ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") + target_compile_options(${targetname} PRIVATE "/bigobj") endif() # let the user pass flags. if(${ARGC} GREATER 2) - ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}") + target_compile_options(${targetname} PRIVATE ${ARGV2}) endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + target_compile_options(${targetname} PRIVATE ${EIGEN_TEST_CUSTOM_CXX_FLAGS}) endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -118,21 +114,14 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() + if (is_gpu_test) + # Add gpu tag for testing only GPU tests. + set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu") + endif() + if(EIGEN_SYCL) # Force include of the SYCL file at the end to avoid errors. set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) - # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags - # to the device compiler. 
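The replacements in this hunk all follow one pattern: string-spliced COMPILE_FLAGS properties give way to per-target commands, which keep definitions and options as proper CMake lists and make workarounds like the flag-splitting loop removed below unnecessary. A minimal sketch of the pattern, using a hypothetical target:

    add_executable(mytest mytest.cpp)                                     # placeholder test target
    target_compile_definitions(mytest PRIVATE EIGEN_TEST_MAX_SIZE=320)    # a leading -D would be stripped automatically
    if(MSVC)
      target_compile_options(mytest PRIVATE "/bigobj")
    endif()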
- get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS) - separate_arguments(target_compile_flags) - foreach(flag ${target_compile_flags}) - if(${flag} MATCHES "^-D.*") - string(REPLACE "-D" "" definition_flag ${flag}) - set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag}) - list(REMOVE_ITEM target_compile_flags ${flag}) - endif() - endforeach() - set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags}) # Link against pthread and add sycl to target set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) @@ -209,12 +198,13 @@ macro(ei_add_test testname) if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes) add_custom_target(${testname}) foreach(suffix ${suffixes}) - ei_add_test_internal(${testname} ${testname}_${suffix} - "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") + ei_add_test_internal(${testname} ${testname}_${suffix} "${ARGV1}" "${ARGV2}") add_dependencies(${testname} ${testname}_${suffix}) + target_compile_definitions(${testname}_${suffix} PRIVATE -DEIGEN_TEST_PART_${suffix}=1) endforeach() else() - ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}") + ei_add_test_internal(${testname} ${testname} "${ARGV1}" "${ARGV2}") + target_compile_definitions(${testname} PRIVATE -DEIGEN_TEST_PART_ALL=1) endif() endmacro() @@ -375,12 +365,6 @@ macro(ei_testing_print_summary) message(STATUS "S390X ZVECTOR: Using architecture defaults") endif() - if(EIGEN_TEST_CXX11) - message(STATUS "C++11: ON") - else() - message(STATUS "C++11: OFF") - endif() - if(EIGEN_TEST_SYCL) if(EIGEN_SYCL_TRISYCL) message(STATUS "SYCL: ON (using triSYCL)") @@ -455,15 +439,7 @@ endmacro() macro(ei_get_compilerver VAR) if(MSVC) - # on windows system, we use a modified CMake script - include(EigenDetermineVSServicePack) - EigenDetermineVSServicePack( my_service_pack ) - - if( my_service_pack ) - set(${VAR} ${my_service_pack}) - else() - set(${VAR} "na") - endif() + set(${VAR} "${CMAKE_CXX_COMPILER_VERSION}") elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI") set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") else() @@ -598,10 +574,7 @@ macro(ei_set_build_string) ei_get_compilerver(LOCAL_COMPILER_VERSION) ei_get_cxxflags(LOCAL_COMPILER_FLAGS) - include(EigenDetermineOSVersion) - DetermineOSVersion(OS_VERSION) - - set(TMP_BUILD_STRING ${OS_VERSION}-${LOCAL_COMPILER_VERSION}) + set(TMP_BUILD_STRING ${CMAKE_SYSTEM}-${LOCAL_COMPILER_VERSION}) if (NOT ${LOCAL_COMPILER_FLAGS} STREQUAL "") set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS}) @@ -618,10 +591,6 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit) endif() - if(EIGEN_TEST_CXX11) - set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11) - endif() - if(EIGEN_BUILD_STRING_SUFFIX) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX}) endif() @@ -671,8 +640,8 @@ endmacro() # Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets # named buildtestspartN with N = { 0, ..., num_splits-1}. # -# The intention behind the existance of this macro is the size of Eigen's -# testsuite. Together with the relativly big compile-times building all tests +# The intention behind the existence of this macro is the size of Eigen's +# testsuite. Together with the relatively big compile-times building all tests # can take a substantial amount of time depending on the available hardware. # # The last buildtestspartN target will build possible remaining tests. 
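Label plumbing ties the testing changes together: targets and tests are tagged (gpu above, smoketest just below), and both the new check_gpu target and the CI scripts then narrow the run with ctest -L. A sketch of the mechanism for a hypothetical test:

    add_test(NAME gpu_basic COMMAND gpu_basic)                  # placeholder GPU test
    set_property(TEST gpu_basic APPEND PROPERTY LABELS "gpu")   # same APPEND pattern as this patch
    # selected later with: ctest -L gpu   (what check_gpu invokes)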
@@ -775,8 +744,7 @@ macro(ei_add_smoke_tests smoke_test_list) if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST) add_dependencies("${buildtarget}" "${test}") # Add label smoketest to be able to run smoketests using ctest - get_property(test_labels TEST ${test} PROPERTY LABELS) - set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") + set_property(TEST ${test} APPEND PROPERTY LABELS "smoketest") endif() endforeach() endmacro(ei_add_smoke_tests) diff --git a/libs/eigen/cmake/FindAccelerate.cmake b/libs/eigen/cmake/FindAccelerate.cmake new file mode 100644 index 0000000..787c31c --- /dev/null +++ b/libs/eigen/cmake/FindAccelerate.cmake @@ -0,0 +1,28 @@ +if (Accelerate_INCLUDES AND Accelerate_LIBRARIES) + set(Accelerate_FIND_QUIETLY TRUE) +endif () + +find_path(Accelerate_INCLUDES + NAMES + Accelerate.h + PATHS $ENV{ACCELERATEDIR} +) + +find_library(Accelerate_LIBRARIES Accelerate PATHS $ENV{ACCELERATEDIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Accelerate DEFAULT_MSG + Accelerate_INCLUDES Accelerate_LIBRARIES) + +if (Accelerate_FOUND) + get_filename_component(Accelerate_PARENTDIR ${Accelerate_INCLUDES} DIRECTORY) + + file(GLOB_RECURSE SparseHeader ${Accelerate_PARENTDIR}/Sparse.h) + + if ("${SparseHeader}" STREQUAL "") + message(STATUS "Accelerate sparse matrix support was not found. Accelerate has been disabled.") + set(Accelerate_FOUND FALSE) + endif () +endif () + +mark_as_advanced(Accelerate_INCLUDES Accelerate_LIBRARIES) diff --git a/libs/eigen/cmake/FindBLAS.cmake b/libs/eigen/cmake/FindBLAS.cmake deleted file mode 100644 index 1bb8f19..0000000 --- a/libs/eigen/cmake/FindBLAS.cmake +++ /dev/null @@ -1,1407 +0,0 @@ -### -# -# @copyright (c) 2009-2014 The University of Tennessee and The University -# of Tennessee Research Foundation. -# All rights reserved. -# @copyright (c) 2012-2016 Inria. All rights reserved. -# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. -# -### -# -# - Find BLAS library -# This module finds an installed fortran library that implements the BLAS -# linear-algebra interface (see http://www.netlib.org/blas/). -# The list of libraries searched for is taken -# from the autoconf macro file, acx_blas.m4 (distributed at -# http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). -# -# This module sets the following variables: -# BLAS_FOUND - set to true if a library implementing the BLAS interface -# is found -# BLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l -# and -L). -# BLAS_COMPILER_FLAGS - uncached list of required compiler flags (including -I for mkl headers). 
-# BLAS_LIBRARIES - uncached list of libraries (using full path name) to -# link against to use BLAS -# BLAS95_LIBRARIES - uncached list of libraries (using full path name) -# to link against to use BLAS95 interface -# BLAS95_FOUND - set to true if a library implementing the BLAS f95 interface -# is found -# BLA_STATIC if set on this determines what kind of linkage we do (static) -# BLA_VENDOR if set checks only the specified vendor, if not set checks -# all the possibilities -# BLAS_VENDOR_FOUND stores the BLAS vendor found -# BLA_F95 if set on tries to find the f95 interfaces for BLAS/LAPACK -# The user can give specific paths where to find the libraries adding cmake -# options at configure (ex: cmake path/to/project -DBLAS_DIR=path/to/blas): -# BLAS_DIR - Where to find the base directory of blas -# BLAS_INCDIR - Where to find the header files -# BLAS_LIBDIR - Where to find the library files -# The module can also look for the following environment variables if paths -# are not given as cmake variable: BLAS_DIR, BLAS_INCDIR, BLAS_LIBDIR -# For MKL case and if no paths are given as hints, we will try to use the MKLROOT -# environment variable -# BLAS_VERBOSE Print some additional information during BLAS libraries detection -########## -### List of vendors (BLA_VENDOR) valid in this module -########## List of vendors (BLA_VENDOR) valid in this module -## Open (for OpenBlas), Eigen (for EigenBlas), Goto, ATLAS PhiPACK, -##  CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT -## Intel10_32 (intel mkl v10 32 bit), Intel10_64lp (intel mkl v10 64 bit,lp thread model, lp64 model), -## Intel10_64lp_seq (intel mkl v10 64 bit,sequential code, lp64 model), -## Intel( older versions of mkl 32 and 64 bit), -##  ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic -# C/CXX should be enabled to use Intel mkl -### -# We handle different modes to find the dependency -# -# - Detection if already installed on the system -# - BLAS libraries can be detected from different ways -# Here is the order of precedence: -# 1) we look in cmake variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined -# 2) we look in environment variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined -# 3) we look in common environnment variables depending on the system (INCLUDE, C_INCLUDE_PATH, CPATH - LIB, DYLD_LIBRARY_PATH, LD_LIBRARY_PATH) -# 4) we look in common system paths depending on the system, see for example paths contained in the following cmake variables: -# - CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES -# - CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_C_IMPLICIT_LINK_DIRECTORIES -# - -#============================================================================= -# Copyright 2007-2009 Kitware, Inc. -# -# Distributed under the OSI-approved BSD License (the "License"); -# see accompanying file Copyright.txt for details. -# -# This software is distributed WITHOUT ANY WARRANTY; without even the -# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the License for more information. -#============================================================================= -# (To distribute this file outside of CMake, substitute the full -# License text for the above reference.) 
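Everything from here to the end of the file is the vendor-by-vendor search logic being deleted, and the patch adds no in-tree replacement; builds that need BLAS presumably fall back to the FindBLAS module shipped with CMake itself. A hedged sketch of that fallback from a consumer's perspective (myapp is a placeholder):

    set(BLA_VENDOR OpenBLAS)                 # optional vendor hint recognized by CMake's FindBLAS
    find_package(BLAS)
    if(BLAS_FOUND)
      target_link_libraries(myapp PRIVATE BLAS::BLAS)  # imported target available since CMake 3.18
    endif()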
- -## Some macros to print status when search for headers and libs -# This macro informs why the _lib_to_find file has not been found -macro(Print_Find_Library_Blas_Status _libname _lib_to_find) - - # save _libname upper/lower case - string(TOUPPER ${_libname} LIBNAME) - string(TOLOWER ${_libname} libname) - - # print status - #message(" ") - if(${LIBNAME}_LIBDIR) - message("${Yellow}${LIBNAME}_LIBDIR is defined but ${_lib_to_find}" - "has not been found in ${ARGN}${ColourReset}") - else() - if(${LIBNAME}_DIR) - message("${Yellow}${LIBNAME}_DIR is defined but ${_lib_to_find}" - "has not been found in ${ARGN}${ColourReset}") - else() - message("${Yellow}${_lib_to_find} not found." - "Nor ${LIBNAME}_DIR neither ${LIBNAME}_LIBDIR" - "are defined so that we look for ${_lib_to_find} in" - "system paths (Linux: LD_LIBRARY_PATH, Windows: LIB," - "Mac: DYLD_LIBRARY_PATH," - "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES," - "CMAKE_C_IMPLICIT_LINK_DIRECTORIES)${ColourReset}") - if(_lib_env) - message("${Yellow}${_lib_to_find} has not been found in" - "${_lib_env}${ColourReset}") - endif() - endif() - endif() - message("${BoldYellow}Please indicate where to find ${_lib_to_find}. You have three options:\n" - "- Option 1: Provide the Installation directory of BLAS library with cmake option: -D${LIBNAME}_DIR=your/path/to/${libname}/\n" - "- Option 2: Provide the directory where to find the library with cmake option: -D${LIBNAME}_LIBDIR=your/path/to/${libname}/lib/\n" - "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" - "- Option 4: If your library provides a PkgConfig file, make sure pkg-config finds your library${ColourReset}") - -endmacro() - -# This macro informs why the _lib_to_find file has not been found -macro(Print_Find_Library_Blas_CheckFunc_Status _name) - - # save _libname upper/lower case - string(TOUPPER ${_name} FUNCNAME) - string(TOLOWER ${_name} funcname) - - # print status - #message(" ") - message("${Red}Libs have been found but check of symbol ${_name} failed " - "with following libraries ${ARGN}${ColourReset}") - message("${BoldRed}Please open your error file CMakeFiles/CMakeError.log" - "to figure out why it fails${ColourReset}") - #message(" ") - -endmacro() - -if (NOT BLAS_FOUND) - set(BLAS_DIR "" CACHE PATH "Installation directory of BLAS library") - if (NOT BLAS_FIND_QUIETLY) - message(STATUS "A cache variable, namely BLAS_DIR, has been set to specify the install directory of BLAS") - endif() -endif() - -option(BLAS_VERBOSE "Print some additional information during BLAS libraries detection" OFF) -mark_as_advanced(BLAS_VERBOSE) - -include(CheckFunctionExists) -include(CheckFortranFunctionExists) -include(CMakeFindDependencyMacro) - -set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) - -# Check the language being used -get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES ) -if( _LANGUAGES_ MATCHES Fortran AND CMAKE_Fortran_COMPILER) - set( _CHECK_FORTRAN TRUE ) -elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) ) - set( _CHECK_FORTRAN FALSE ) -else() - if(BLAS_FIND_REQUIRED) - message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.") - else() - message(STATUS "Looking for BLAS... - NOT found (Unsupported languages)") - return() - endif() -endif() - -macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) - # This macro checks for the existence of the combination of fortran libraries - # given by _list. 
If the combination is found, this macro checks (using the - # Check_Fortran_Function_Exists macro) whether can link against that library - # combination using the name of a routine given by _name using the linker - # flags given by _flags. If the combination of libraries is found and passes - # the link test, LIBRARIES is set to the list of complete library paths that - # have been found. Otherwise, LIBRARIES is set to FALSE. - - # N.B. _prefix is the prefix applied to the names of all cached variables that - # are generated internally and marked advanced by this macro. - - set(_libdir ${ARGN}) - - set(_libraries_work TRUE) - set(${LIBRARIES}) - set(_combined_name) - set(ENV_MKLROOT "$ENV{MKLROOT}") - set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") - set(ENV_BLAS_LIBDIR "$ENV{BLAS_LIBDIR}") - if (NOT _libdir) - if (BLAS_LIBDIR) - list(APPEND _libdir "${BLAS_LIBDIR}") - elseif (BLAS_DIR) - list(APPEND _libdir "${BLAS_DIR}") - list(APPEND _libdir "${BLAS_DIR}/lib") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - list(APPEND _libdir "${BLAS_DIR}/lib64") - list(APPEND _libdir "${BLAS_DIR}/lib/intel64") - else() - list(APPEND _libdir "${BLAS_DIR}/lib32") - list(APPEND _libdir "${BLAS_DIR}/lib/ia32") - endif() - elseif(ENV_BLAS_LIBDIR) - list(APPEND _libdir "${ENV_BLAS_LIBDIR}") - elseif(ENV_BLAS_DIR) - list(APPEND _libdir "${ENV_BLAS_DIR}") - list(APPEND _libdir "${ENV_BLAS_DIR}/lib") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - list(APPEND _libdir "${ENV_BLAS_DIR}/lib64") - list(APPEND _libdir "${ENV_BLAS_DIR}/lib/intel64") - else() - list(APPEND _libdir "${ENV_BLAS_DIR}/lib32") - list(APPEND _libdir "${ENV_BLAS_DIR}/lib/ia32") - endif() - else() - if (ENV_MKLROOT) - list(APPEND _libdir "${ENV_MKLROOT}/lib") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - list(APPEND _libdir "${ENV_MKLROOT}/lib64") - list(APPEND _libdir "${ENV_MKLROOT}/lib/intel64") - else() - list(APPEND _libdir "${ENV_MKLROOT}/lib32") - list(APPEND _libdir "${ENV_MKLROOT}/lib/ia32") - endif() - endif() - if (WIN32) - string(REPLACE ":" ";" _libdir2 "$ENV{LIB}") - elseif (APPLE) - string(REPLACE ":" ";" _libdir2 "$ENV{DYLD_LIBRARY_PATH}") - else () - string(REPLACE ":" ";" _libdir2 "$ENV{LD_LIBRARY_PATH}") - endif () - list(APPEND _libdir "${_libdir2}") - list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") - list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") - endif() - endif () - - if (BLAS_VERBOSE) - message("${Cyan}Try to find BLAS libraries: ${_list}") - endif () - - foreach(_library ${_list}) - set(_combined_name ${_combined_name}_${_library}) - - if(_libraries_work) - if (BLA_STATIC) - if (WIN32) - set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) - endif () - if (APPLE) - set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) - else () - set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) - endif () - else () - if (CMAKE_SYSTEM_NAME STREQUAL "Linux") - # for ubuntu's libblas3gf and liblapack3gf packages - set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) - endif () - endif () - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - HINTS ${_libdir} - NO_DEFAULT_PATH - ) - mark_as_advanced(${_prefix}_${_library}_LIBRARY) - # Print status if not found - # ------------------------- - if (NOT ${_prefix}_${_library}_LIBRARY AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) - Print_Find_Library_Blas_Status(blas ${_library} ${_libdir}) - endif () - set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_work 
${${_prefix}_${_library}_LIBRARY}) - endif() - endforeach() - - if(_libraries_work) - # Test this combination of libraries. - if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) - list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") - list(APPEND ${LIBRARIES} "-Wl,--end-group") - endif() - set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}") - set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}") - if (BLAS_VERBOSE) - message("${Cyan}BLAS libs found for BLA_VENDOR ${BLA_VENDOR}." - "Try to compile symbol ${_name} with following libraries:" - "${CMAKE_REQUIRED_LIBRARIES}") - endif () - if(NOT BLAS_FOUND) - unset(${_prefix}${_combined_name}_WORKS CACHE) - endif() - if (_CHECK_FORTRAN) - if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") - string(REPLACE "mkl_intel_lp64" "mkl_gf_lp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") - string(REPLACE "mkl_intel_ilp64" "mkl_gf_ilp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") - endif() - check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) - else() - check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) - endif() - mark_as_advanced(${_prefix}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) - # Print status if not found - # ------------------------- - if (NOT _libraries_work AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) - Print_Find_Library_Blas_CheckFunc_Status(${_name} ${CMAKE_REQUIRED_LIBRARIES}) - endif () - set(CMAKE_REQUIRED_LIBRARIES) - endif() - - if(_libraries_work) - set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) - else() - set(${LIBRARIES} FALSE) - endif() - -endmacro() - - -set(BLAS_LINKER_FLAGS) -set(BLAS_LIBRARIES) -set(BLAS95_LIBRARIES) -if ($ENV{BLA_VENDOR} MATCHES ".+") - set(BLA_VENDOR $ENV{BLA_VENDOR}) -else () - if(NOT BLA_VENDOR) - set(BLA_VENDOR "All") - endif() -endif () - -#BLAS in intel mkl 10 library? 
(em64t 64bit) -if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") - # Looking for include - # ------------------- - - # Add system include paths to search include - # ------------------------------------------ - unset(_inc_env) - set(ENV_MKLROOT "$ENV{MKLROOT}") - set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") - set(ENV_BLAS_INCDIR "$ENV{BLAS_INCDIR}") - if(ENV_BLAS_INCDIR) - list(APPEND _inc_env "${ENV_BLAS_INCDIR}") - elseif(ENV_BLAS_DIR) - list(APPEND _inc_env "${ENV_BLAS_DIR}") - list(APPEND _inc_env "${ENV_BLAS_DIR}/include") - else() - if (ENV_MKLROOT) - list(APPEND _inc_env "${ENV_MKLROOT}/include") - endif() - # system variables - if(WIN32) - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") - list(APPEND _inc_env "${_path_env}") - else() - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{CPATH}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") - list(APPEND _inc_env "${_path_env}") - endif() - endif() - list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") - list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") - list(REMOVE_DUPLICATES _inc_env) - - # set paths where to look for - set(PATH_TO_LOOK_FOR "${_inc_env}") - - # Try to find the fftw header in the given paths - # ------------------------------------------------- - # call cmake macro to find the header path - if(BLAS_INCDIR) - set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") - find_path(BLAS_mkl.h_DIRS - NAMES mkl.h - HINTS ${BLAS_INCDIR}) - else() - if(BLAS_DIR) - set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") - find_path(BLAS_mkl.h_DIRS - NAMES mkl.h - HINTS ${BLAS_DIR} - PATH_SUFFIXES "include") - else() - set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") - find_path(BLAS_mkl.h_DIRS - NAMES mkl.h - HINTS ${PATH_TO_LOOK_FOR}) - endif() - endif() - mark_as_advanced(BLAS_mkl.h_DIRS) - - # If found, add path to cmake variable - # ------------------------------------ - if (BLAS_mkl.h_DIRS) - set(BLAS_INCLUDE_DIRS "${BLAS_mkl.h_DIRS}") - else () - set(BLAS_INCLUDE_DIRS "BLAS_INCLUDE_DIRS-NOTFOUND") - if(NOT BLAS_FIND_QUIETLY) - message(STATUS "Looking for BLAS -- mkl.h not found") - endif() - endif() - - if (WIN32) - string(REPLACE ":" ";" _libdir "$ENV{LIB}") - elseif (APPLE) - string(REPLACE ":" ";" _libdir "$ENV{DYLD_LIBRARY_PATH}") - else () - string(REPLACE ":" ";" _libdir "$ENV{LD_LIBRARY_PATH}") - endif () - list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") - list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") - # libiomp5 - # -------- - set(OMP_iomp5_LIBRARY "OMP_iomp5_LIBRARY-NOTFOUND") - find_library(OMP_iomp5_LIBRARY - NAMES iomp5 - HINTS ${_libdir} - ) - mark_as_advanced(OMP_iomp5_LIBRARY) - set(OMP_LIB "") - # libgomp - # ------- - set(OMP_gomp_LIBRARY "OMP_gomp_LIBRARY-NOTFOUND") - find_library(OMP_gomp_LIBRARY - NAMES gomp - HINTS ${_libdir} - ) - mark_as_advanced(OMP_gomp_LIBRARY) - # choose one or another depending on the compilo - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - if (OMP_gomp_LIBRARY) - set(OMP_LIB "${OMP_gomp_LIBRARY}") - endif() - else() - if (OMP_iomp5_LIBRARY) - set(OMP_LIB "${OMP_iomp5_LIBRARY}") - endif() - endif() - - if (UNIX AND NOT WIN32) - # m - find_library(M_LIBRARY - NAMES m - HINTS ${_libdir}) - mark_as_advanced(M_LIBRARY) - if(M_LIBRARY) - set(LM "-lm") - 
else() - set(LM "") - endif() - # Fortran - set(LGFORTRAN "") - if (CMAKE_C_COMPILER_ID MATCHES "GNU") - find_library( - FORTRAN_gfortran_LIBRARY - NAMES gfortran - HINTS ${_libdir} - ) - mark_as_advanced(FORTRAN_gfortran_LIBRARY) - if (FORTRAN_gfortran_LIBRARY) - set(LGFORTRAN "${FORTRAN_gfortran_LIBRARY}") - endif() - elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") - find_library( - FORTRAN_ifcore_LIBRARY - NAMES ifcore - HINTS ${_libdir} - ) - mark_as_advanced(FORTRAN_ifcore_LIBRARY) - if (FORTRAN_ifcore_LIBRARY) - set(LGFORTRAN "{FORTRAN_ifcore_LIBRARY}") - endif() - endif() - set(BLAS_COMPILER_FLAGS "") - if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") - if (CMAKE_C_COMPILER_ID STREQUAL "Intel") - list(APPEND BLAS_COMPILER_FLAGS "-openmp") - endif() - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - list(APPEND BLAS_COMPILER_FLAGS "-fopenmp") - endif() - endif() - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - if (BLA_VENDOR STREQUAL "Intel10_32") - list(APPEND BLAS_COMPILER_FLAGS "-m32") - else() - list(APPEND BLAS_COMPILER_FLAGS "-m64") - endif() - if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") - list(APPEND OMP_LIB "-ldl") - endif() - if (ENV_MKLROOT) - list(APPEND BLAS_COMPILER_FLAGS "-I${ENV_MKLROOT}/include") - endif() - endif() - - set(additional_flags "") - if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") - set(additional_flags "-Wl,--no-as-needed") - endif() - endif () - - if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) - if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) - find_dependency(Threads) - else() - find_dependency(Threads REQUIRED) - endif() - - set(BLAS_SEARCH_LIBS "") - - if(BLA_F95) - - set(BLAS_mkl_SEARCH_SYMBOL SGEMM) - set(_LIBRARIES BLAS95_LIBRARIES) - if (WIN32) - if (BLA_STATIC) - set(BLAS_mkl_DLL_SUFFIX "") - else() - set(BLAS_mkl_DLL_SUFFIX "_dll") - endif() - - # Find the main file (32-bit or 64-bit) - set(BLAS_SEARCH_LIBS_WIN_MAIN "") - if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN - "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") - endif() - if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN - "mkl_blas95_lp64${BLAS_mkl_DLL_SUFFIX} mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") - endif () - - # Add threading/sequential libs - set(BLAS_SEARCH_LIBS_WIN_THREAD "") - if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") - endif() - if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") - # old version - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") - # mkl >= 10.3 - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") - endif() - - # Cartesian product of the above - foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) - foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) - list(APPEND BLAS_SEARCH_LIBS - "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") - endforeach() - endforeach() - else () - if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS - "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide") - endif () - if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") - # old version - list(APPEND BLAS_SEARCH_LIBS - "mkl_blas95 mkl_intel_lp64 mkl_intel_thread mkl_core guide") - # mkl >= 10.3 - if (CMAKE_C_COMPILER_ID STREQUAL "Intel") - list(APPEND BLAS_SEARCH_LIBS - 
"mkl_blas95_lp64 mkl_intel_lp64 mkl_intel_thread mkl_core") - endif() - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - list(APPEND BLAS_SEARCH_LIBS - "mkl_blas95_lp64 mkl_intel_lp64 mkl_gnu_thread mkl_core") - endif() - endif () - if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel_lp64 mkl_sequential mkl_core") - if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") - set(OMP_LIB "") - endif() - endif () - endif () - - else () - - set(BLAS_mkl_SEARCH_SYMBOL sgemm) - set(_LIBRARIES BLAS_LIBRARIES) - if (WIN32) - if (BLA_STATIC) - set(BLAS_mkl_DLL_SUFFIX "") - else() - set(BLAS_mkl_DLL_SUFFIX "_dll") - endif() - - # Find the main file (32-bit or 64-bit) - set(BLAS_SEARCH_LIBS_WIN_MAIN "") - if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN - "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") - endif() - if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN - "mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") - endif () - - # Add threading/sequential libs - set(BLAS_SEARCH_LIBS_WIN_THREAD "") - if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") - # old version - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") - # mkl >= 10.3 - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") - endif() - if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD - "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") - endif() - - # Cartesian product of the above - foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) - foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) - list(APPEND BLAS_SEARCH_LIBS - "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") - endforeach() - endforeach() - else () - if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel mkl_intel_thread mkl_core guide") - endif () - if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") - # old version - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel_lp64 mkl_intel_thread mkl_core guide") - # mkl >= 10.3 - if (CMAKE_C_COMPILER_ID STREQUAL "Intel") - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel_lp64 mkl_intel_thread mkl_core") - endif() - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel_lp64 mkl_gnu_thread mkl_core") - endif() - endif () - if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS - "mkl_intel_lp64 mkl_sequential mkl_core") - if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") - set(OMP_LIB "") - endif() - endif () - #older vesions of intel mkl libs - if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") - list(APPEND BLAS_SEARCH_LIBS - "mkl") - list(APPEND BLAS_SEARCH_LIBS - "mkl_ia32") - list(APPEND BLAS_SEARCH_LIBS - "mkl_em64t") - endif () - endif () - - endif () - - foreach (IT ${BLAS_SEARCH_LIBS}) - string(REPLACE " " ";" SEARCH_LIBS ${IT}) - if (${_LIBRARIES}) - else () - check_fortran_libraries( - ${_LIBRARIES} - BLAS - ${BLAS_mkl_SEARCH_SYMBOL} - "${additional_flags}" - "${SEARCH_LIBS}" - "${OMP_LIB};${CMAKE_THREAD_LIBS_INIT};${LM}" - ) - if(_LIBRARIES) - set(BLAS_LINKER_FLAGS "${additional_flags}") - endif() - endif() - endforeach () - if(NOT BLAS_FIND_QUIETLY) - if(${_LIBRARIES}) - message(STATUS "Looking for MKL BLAS: found") - else() - message(STATUS "Looking for MKL BLAS: not found") - endif() - endif() - if 
(${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Intel MKL") - endif() - endif () - endif() -endif () - - -if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "goto2" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Goto BLAS: found") - else() - message(STATUS "Looking for Goto BLAS: not found") - endif() - endif() - endif() - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Goto") - endif() - -endif () - - -# OpenBlas -if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - # openblas (http://www.openblas.net/) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "openblas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Open BLAS: found") - else() - message(STATUS "Looking for Open BLAS: not found") - endif() - endif() - endif() - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Openblas") - endif() - -endif () - - -# EigenBlas -if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "eigen_blas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - message(STATUS "Looking for Eigen BLAS: found") - else() - message(STATUS "Looking for Eigen BLAS: not found") - endif() - endif() - endif() - - if(NOT BLAS_LIBRARIES) - # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "eigen_blas_static" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Eigen BLAS: found") - else() - message(STATUS "Looking for Eigen BLAS: not found") - endif() - endif() - endif() - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Eigen") - endif() - -endif () - - -if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - dgemm - "" - "f77blas;atlas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Atlas BLAS: found") - else() - message(STATUS "Looking for Atlas BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Atlas") - endif() - -endif () - - -# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) -if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "sgemm;dgemm;blas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for PhiPACK BLAS: found") - else() - message(STATUS "Looking for PhiPACK BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "PhiPACK") - endif() - -endif () - - -# BLAS in Alpha CXML library? 
-if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "cxml" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for CXML BLAS: found") - else() - message(STATUS "Looking for CXML BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "CXML") - endif() - -endif () - - -# BLAS in Alpha DXML library? (now called CXML, see above) -if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "dxml" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for DXML BLAS: found") - else() - message(STATUS "Looking for DXML BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "DXML") - endif() - -endif () - - -# BLAS in Sun Performance library? -if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "-xlic_lib=sunperf" - "sunperf;sunmath" - "" - ) - if(BLAS_LIBRARIES) - set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") - endif() - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for SunPerf BLAS: found") - else() - message(STATUS "Looking for SunPerf BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "SunPerf") - endif() - -endif () - - -# BLAS in SCSL library? (SGI/Cray Scientific Library) -if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "scsl" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for SCSL BLAS: found") - else() - message(STATUS "Looking for SCSL BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "SunPerf") - endif() - -endif () - - -# BLAS in SGIMATH library? 
-if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "complib.sgimath" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for SGIMATH BLAS: found") - else() - message(STATUS "Looking for SGIMATH BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "SGIMATH") - endif() - -endif () - - -# BLAS in IBM ESSL library (requires generic BLAS lib, too) -if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "essl;xlfmath;xlf90_r;blas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for IBM ESSL BLAS: found") - else() - message(STATUS "Looking for IBM ESSL BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "IBM ESSL") - endif() - -endif () - -# BLAS in IBM ESSL_MT library (requires generic BLAS lib, too) -if (BLA_VENDOR STREQUAL "IBMESSLMT" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "esslsmp;xlsmp;xlfmath;xlf90_r;blas" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for IBM ESSL MT BLAS: found") - else() - message(STATUS "Looking for IBM ESSL MT BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "IBM ESSL MT") - endif() - -endif () - - -#BLAS in acml library? -if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") - - if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR - ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR - ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS))) - - # try to find acml in "standard" paths - if( WIN32 ) - file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) - else() - file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) - endif() - if( WIN32 ) - file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) - else() - file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) - endif() - list(GET _ACML_ROOT 0 _ACML_ROOT) - list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) - - if( _ACML_ROOT ) - - get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) - if( SIZEOF_INTEGER EQUAL 8 ) - set( _ACML_PATH_SUFFIX "_int64" ) - else() - set( _ACML_PATH_SUFFIX "" ) - endif() - if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) - set( _ACML_COMPILER32 "ifort32" ) - set( _ACML_COMPILER64 "ifort64" ) - elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) - set( _ACML_COMPILER32 "sun32" ) - set( _ACML_COMPILER64 "sun64" ) - elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) - set( _ACML_COMPILER32 "pgi32" ) - if( WIN32 ) - set( _ACML_COMPILER64 "win64" ) - else() - set( _ACML_COMPILER64 "pgi64" ) - endif() - elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) - # 32 bit builds not supported on Open64 but for code simplicity - # We'll just use the same directory twice - set( _ACML_COMPILER32 "open64_64" ) - set( _ACML_COMPILER64 "open64_64" ) - elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) - set( _ACML_COMPILER32 "nag32" ) - set( _ACML_COMPILER64 "nag64" ) - else() - set( _ACML_COMPILER32 "gfortran32" ) - set( _ACML_COMPILER64 "gfortran64" ) - endif() - - if( BLA_VENDOR STREQUAL "ACML_MP" ) - set(_ACML_MP_LIB_DIRS - 
"${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" - "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) - else() - set(_ACML_LIB_DIRS - "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" - "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) - endif() - - endif() - - elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) - - set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) - - endif() - - if( BLA_VENDOR STREQUAL "ACML_MP" ) - foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) - check_fortran_libraries ( - BLAS_LIBRARIES - BLAS - sgemm - "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} - ) - if( BLAS_LIBRARIES ) - break() - endif() - endforeach() - elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) - foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) - check_fortran_libraries ( - BLAS_LIBRARIES - BLAS - sgemm - "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} - ) - if( BLAS_LIBRARIES ) - break() - endif() - endforeach() - else() - foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) - check_fortran_libraries ( - BLAS_LIBRARIES - BLAS - sgemm - "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} - ) - if( BLAS_LIBRARIES ) - break() - endif() - endforeach() - endif() - - # Either acml or acml_mp should be in LD_LIBRARY_PATH but not both - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "acml;acml_mv" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for ACML BLAS: found") - else() - message(STATUS "Looking for ACML BLAS: not found") - endif() - endif() - endif() - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "acml_mp;acml_mv" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for ACML BLAS: found") - else() - message(STATUS "Looking for ACML BLAS: not found") - endif() - endif() - endif() - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "acml;acml_mv;CALBLAS" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for ACML BLAS: found") - else() - message(STATUS "Looking for ACML BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "ACML") - endif() - -endif () # ACML - - -# Apple BLAS library? -if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") - - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - dgemm - "" - "Accelerate" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Apple BLAS: found") - else() - message(STATUS "Looking for Apple BLAS: not found") - endif() - endif() - endif() - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Apple Accelerate") - endif() - -endif () - - -if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") - - if ( NOT BLAS_LIBRARIES ) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - dgemm - "" - "vecLib" - "" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for NAS BLAS: found") - else() - message(STATUS "Looking for NAS BLAS: not found") - endif() - endif() - endif () - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "NAS") - endif() - -endif () - - -# Generic BLAS library? 
-if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") - - set(BLAS_SEARCH_LIBS "blas;blas_LINUX;blas_MAC;blas_WINDOWS;refblas") - foreach (SEARCH_LIB ${BLAS_SEARCH_LIBS}) - if (BLAS_LIBRARIES) - else () - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "${SEARCH_LIB}" - "${LGFORTRAN}" - ) - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_LIBRARIES) - message(STATUS "Looking for Generic BLAS: found") - else() - message(STATUS "Looking for Generic BLAS: not found") - endif() - endif() - endif() - endforeach () - - if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) - set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas") - endif() - -endif () - - -if(BLA_F95) - - if(BLAS95_LIBRARIES) - set(BLAS95_FOUND TRUE) - else() - set(BLAS95_FOUND FALSE) - endif() - - if(NOT BLAS_FIND_QUIETLY) - if(BLAS95_FOUND) - message(STATUS "A library with BLAS95 API found.") - message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") - else() - message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed." - "\nPlease indicate where to find blas libraries. You have three options:\n" - "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" - "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" - "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" - "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." - "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." - "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," - "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," - "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") - if(BLAS_FIND_REQUIRED) - message(FATAL_ERROR - "A required library with BLAS95 API not found. Please specify library location.") - else() - message(STATUS - "A library with BLAS95 API not found. Please specify library location.") - endif() - endif() - endif() - - set(BLAS_FOUND TRUE) - set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") - -else() - - if(BLAS_LIBRARIES) - set(BLAS_FOUND TRUE) - else() - set(BLAS_FOUND FALSE) - endif() - - if(NOT BLAS_FIND_QUIETLY) - if(BLAS_FOUND) - message(STATUS "A library with BLAS API found.") - message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") - else() - message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed." - "\nPlease indicate where to find blas libraries. You have three options:\n" - "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" - "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" - "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" - "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." - "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." 
- "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," - "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," - "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") - if(BLAS_FIND_REQUIRED) - message(FATAL_ERROR - "A required library with BLAS API not found. Please specify library location.") - else() - message(STATUS - "A library with BLAS API not found. Please specify library location.") - endif() - endif() - endif() - -endif() - -set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) - -if (BLAS_FOUND) - list(GET BLAS_LIBRARIES 0 first_lib) - get_filename_component(first_lib_path "${first_lib}" PATH) - if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") - string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") - set(BLAS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of BLAS library" FORCE) - else() - set(BLAS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of BLAS library" FORCE) - endif() -endif() -mark_as_advanced(BLAS_DIR) -mark_as_advanced(BLAS_DIR_FOUND) diff --git a/libs/eigen/cmake/FindCLANG_FORMAT.cmake b/libs/eigen/cmake/FindCLANG_FORMAT.cmake new file mode 100644 index 0000000..e00f19f --- /dev/null +++ b/libs/eigen/cmake/FindCLANG_FORMAT.cmake @@ -0,0 +1,61 @@ + + +# Find clang-format +# +# CLANG_FORMAT_EXECUTABLE - Path to clang-format executable +# CLANG_FORMAT_FOUND - True if the clang-format executable was found. +# CLANG_FORMAT_VERSION - The version of clang-format found +# +# Copyright 2009-2020 The VOTCA Development Team (http://www.votca.org) +# +# Licensed under the Mozilla Public License Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.mozilla.org/en-US/MPL/2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +find_program(CLANG_FORMAT_EXECUTABLE + NAMES + clang-format-9 + clang-format + clang-format-11 + clang-format-10 + clang-format-8 + clang-format-7 + + DOC "clang-format executable") +mark_as_advanced(CLANG_FORMAT_EXECUTABLE) + +# Extract version from command "clang-format -version" +if(CLANG_FORMAT_EXECUTABLE) + execute_process(COMMAND ${CLANG_FORMAT_EXECUTABLE} -version + OUTPUT_VARIABLE clang_format_version + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(clang_format_version MATCHES "^.*clang-format version .*") + # clang_format_version sample: "clang-format version 3.9.1-4ubuntu3~16.04.1 + # (tags/RELEASE_391/rc2)" + string(REGEX + REPLACE "^.*clang-format version ([.0-9]+).*" + "\\1" + CLANG_FORMAT_VERSION + "${clang_format_version}") + # CLANG_FORMAT_VERSION sample: "3.9.1" + else() + set(CLANG_FORMAT_VERSION 0.0) + endif() +else() + set(CLANG_FORMAT_VERSION 0.0) +endif() + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set CLANG_FORMAT_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(CLANG_FORMAT REQUIRED_VARS CLANG_FORMAT_EXECUTABLE VERSION_VAR CLANG_FORMAT_VERSION) diff --git a/libs/eigen/cmake/FindComputeCpp.cmake b/libs/eigen/cmake/FindComputeCpp.cmake index 1c271f0..e200522 100644 --- a/libs/eigen/cmake/FindComputeCpp.cmake +++ b/libs/eigen/cmake/FindComputeCpp.cmake @@ -382,7 +382,7 @@ endfunction(__build_ir) ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. +# target and sets a dependency on that new command. # # TARGET : Name of the target to add SYCL to. # SOURCES : Source files to be compiled for SYCL. diff --git a/libs/eigen/cmake/FindEigen2.cmake b/libs/eigen/cmake/FindEigen2.cmake deleted file mode 100644 index eb2709d..0000000 --- a/libs/eigen/cmake/FindEigen2.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# - Try to find Eigen2 lib -# -# This module supports requiring a minimum version, e.g. you can do -# find_package(Eigen2 2.0.3) -# to require version 2.0.3 to newer of Eigen2. -# -# Once done this will define -# -# EIGEN2_FOUND - system has eigen lib with correct version -# EIGEN2_INCLUDE_DIR - the eigen include directory -# EIGEN2_VERSION - eigen version - -# Copyright (c) 2006, 2007 Montel Laurent, -# Copyright (c) 2008, 2009 Gael Guennebaud, -# Redistribution and use is allowed according to the terms of the BSD license. 
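# Note for downstream users: with FindEigen2.cmake and FindEigen3.cmake removed
# from the tree, a consuming project is expected to locate Eigen through the
# Eigen3Config.cmake package that Eigen's own build generates and installs.
# A minimal consumer sketch (the target name `app` and main.cpp are hypothetical):
#
#   find_package(Eigen3 3.4 REQUIRED NO_MODULE)
#   add_executable(app main.cpp)
#   target_link_libraries(app PRIVATE Eigen3::Eigen)
#
# Passing NO_MODULE forces config mode, so no Find module is needed at all.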
- -if(NOT Eigen2_FIND_VERSION) - if(NOT Eigen2_FIND_VERSION_MAJOR) - set(Eigen2_FIND_VERSION_MAJOR 2) - endif() - if(NOT Eigen2_FIND_VERSION_MINOR) - set(Eigen2_FIND_VERSION_MINOR 0) - endif() - if(NOT Eigen2_FIND_VERSION_PATCH) - set(Eigen2_FIND_VERSION_PATCH 0) - endif() - - set(Eigen2_FIND_VERSION "${Eigen2_FIND_VERSION_MAJOR}.${Eigen2_FIND_VERSION_MINOR}.${Eigen2_FIND_VERSION_PATCH}") -endif() - -macro(_eigen2_check_version) - file(READ "${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen2_version_header) - - string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen2_world_version_match "${_eigen2_version_header}") - set(EIGEN2_WORLD_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen2_major_version_match "${_eigen2_version_header}") - set(EIGEN2_MAJOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen2_minor_version_match "${_eigen2_version_header}") - set(EIGEN2_MINOR_VERSION "${CMAKE_MATCH_1}") - - set(EIGEN2_VERSION ${EIGEN2_WORLD_VERSION}.${EIGEN2_MAJOR_VERSION}.${EIGEN2_MINOR_VERSION}) - if((${EIGEN2_WORLD_VERSION} NOTEQUAL 2) OR (${EIGEN2_MAJOR_VERSION} GREATER 10) OR (${EIGEN2_VERSION} VERSION_LESS ${Eigen2_FIND_VERSION})) - set(EIGEN2_VERSION_OK FALSE) - else() - set(EIGEN2_VERSION_OK TRUE) - endif() - - if(NOT EIGEN2_VERSION_OK) - - message(STATUS "Eigen2 version ${EIGEN2_VERSION} found in ${EIGEN2_INCLUDE_DIR}, " - "but at least version ${Eigen2_FIND_VERSION} is required") - endif() -endmacro() - -if (EIGEN2_INCLUDE_DIR) - - # in cache already - _eigen2_check_version() - set(EIGEN2_FOUND ${EIGEN2_VERSION_OK}) - -else () - -find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core - PATHS - ${INCLUDE_INSTALL_DIR} - ${KDE4_INCLUDE_DIR} - PATH_SUFFIXES eigen2 - ) - -if(EIGEN2_INCLUDE_DIR) - _eigen2_check_version() -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Eigen2 DEFAULT_MSG EIGEN2_INCLUDE_DIR EIGEN2_VERSION_OK) - -mark_as_advanced(EIGEN2_INCLUDE_DIR) - -endif() - diff --git a/libs/eigen/cmake/FindEigen3.cmake b/libs/eigen/cmake/FindEigen3.cmake deleted file mode 100644 index 0b36805..0000000 --- a/libs/eigen/cmake/FindEigen3.cmake +++ /dev/null @@ -1,107 +0,0 @@ -# - Try to find Eigen3 lib -# -# This module supports requiring a minimum version, e.g. you can do -# find_package(Eigen3 3.1.2) -# to require version 3.1.2 or newer of Eigen3. -# -# Once done this will define -# -# EIGEN3_FOUND - system has eigen lib with correct version -# EIGEN3_INCLUDE_DIR - the eigen include directory -# EIGEN3_VERSION - eigen version -# -# and the following imported target: -# -# Eigen3::Eigen - The header-only Eigen library -# -# This module reads hints about search locations from -# the following environment variables: -# -# EIGEN3_ROOT -# EIGEN3_ROOT_DIR - -# Copyright (c) 2006, 2007 Montel Laurent, -# Copyright (c) 2008, 2009 Gael Guennebaud, -# Copyright (c) 2009 Benoit Jacob -# Redistribution and use is allowed according to the terms of the 2-clause BSD license. 
- -if(NOT Eigen3_FIND_VERSION) - if(NOT Eigen3_FIND_VERSION_MAJOR) - set(Eigen3_FIND_VERSION_MAJOR 2) - endif() - if(NOT Eigen3_FIND_VERSION_MINOR) - set(Eigen3_FIND_VERSION_MINOR 91) - endif() - if(NOT Eigen3_FIND_VERSION_PATCH) - set(Eigen3_FIND_VERSION_PATCH 0) - endif() - - set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}") -endif() - -macro(_eigen3_check_version) - file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header) - - string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}") - set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}") - set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}") - set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}") - - set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION}) - if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) - set(EIGEN3_VERSION_OK FALSE) - else() - set(EIGEN3_VERSION_OK TRUE) - endif() - - if(NOT EIGEN3_VERSION_OK) - - message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, " - "but at least version ${Eigen3_FIND_VERSION} is required") - endif() -endmacro() - -if (EIGEN3_INCLUDE_DIR) - - # in cache already - _eigen3_check_version() - set(EIGEN3_FOUND ${EIGEN3_VERSION_OK}) - set(Eigen3_FOUND ${EIGEN3_VERSION_OK}) - -else () - - # search first if an Eigen3Config.cmake is available in the system, - # if successful this would set EIGEN3_INCLUDE_DIR and the rest of - # the script will work as usual - find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET) - - if(NOT EIGEN3_INCLUDE_DIR) - find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library - HINTS - ENV EIGEN3_ROOT - ENV EIGEN3_ROOT_DIR - PATHS - ${CMAKE_INSTALL_PREFIX}/include - ${KDE4_INCLUDE_DIR} - PATH_SUFFIXES eigen3 eigen - ) - endif() - - if(EIGEN3_INCLUDE_DIR) - _eigen3_check_version() - endif() - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK) - - mark_as_advanced(EIGEN3_INCLUDE_DIR) - -endif() - -if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen) - add_library(Eigen3::Eigen INTERFACE IMPORTED) - set_target_properties(Eigen3::Eigen PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}") -endif() diff --git a/libs/eigen/cmake/FindGLEW.cmake b/libs/eigen/cmake/FindGLEW.cmake deleted file mode 100644 index 9d486d5..0000000 --- a/libs/eigen/cmake/FindGLEW.cmake +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2009 Boudewijn Rempt -# -# Redistribution and use is allowed according to the terms of the BSD license. -# For details see the accompanying COPYING-CMAKE-SCRIPTS file. -# -# - try to find glew library and include files -# GLEW_INCLUDE_DIR, where to find GL/glew.h, etc. -# GLEW_LIBRARIES, the libraries to link against -# GLEW_FOUND, If false, do not try to use GLEW. -# Also defined, but not for general use are: -# GLEW_GLEW_LIBRARY = the full path to the glew library. 
- -if (WIN32) - - if(CYGWIN) - - find_path( GLEW_INCLUDE_DIR GL/glew.h) - - find_library( GLEW_GLEW_LIBRARY glew32 - ${OPENGL_LIBRARY_DIR} - /usr/lib/w32api - /usr/X11R6/lib - ) - - - else(CYGWIN) - - find_path( GLEW_INCLUDE_DIR GL/glew.h - $ENV{GLEW_ROOT_PATH}/include - ) - - find_library( GLEW_GLEW_LIBRARY - NAMES glew glew32 - PATHS - $ENV{GLEW_ROOT_PATH}/lib - ${OPENGL_LIBRARY_DIR} - ) - - endif(CYGWIN) - -else (WIN32) - - if (APPLE) -# These values for Apple could probably do with improvement. - find_path( GLEW_INCLUDE_DIR glew.h - /System/Library/Frameworks/GLEW.framework/Versions/A/Headers - ${OPENGL_LIBRARY_DIR} - ) - set(GLEW_GLEW_LIBRARY "-framework GLEW" CACHE STRING "GLEW library for OSX") - set(GLEW_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX") - else (APPLE) - - find_path( GLEW_INCLUDE_DIR GL/glew.h - /usr/include/GL - /usr/openwin/share/include - /usr/openwin/include - /usr/X11R6/include - /usr/include/X11 - /opt/graphics/OpenGL/include - /opt/graphics/OpenGL/contrib/libglew - ) - - find_library( GLEW_GLEW_LIBRARY GLEW - /usr/openwin/lib - /usr/X11R6/lib - ) - - endif (APPLE) - -endif (WIN32) - -set( GLEW_FOUND "NO" ) -if(GLEW_INCLUDE_DIR) - if(GLEW_GLEW_LIBRARY) - # Is -lXi and -lXmu required on all platforms that have it? - # If not, we need some way to figure out what platform we are on. - set( GLEW_LIBRARIES - ${GLEW_GLEW_LIBRARY} - ${GLEW_cocoa_LIBRARY} - ) - set( GLEW_FOUND "YES" ) - -#The following deprecated settings are for backwards compatibility with CMake1.4 - set (GLEW_LIBRARY ${GLEW_LIBRARIES}) - set (GLEW_INCLUDE_PATH ${GLEW_INCLUDE_DIR}) - - endif(GLEW_GLEW_LIBRARY) -endif(GLEW_INCLUDE_DIR) - -if(GLEW_FOUND) - if(NOT GLEW_FIND_QUIETLY) - message(STATUS "Found Glew: ${GLEW_LIBRARIES}") - endif(NOT GLEW_FIND_QUIETLY) -else(GLEW_FOUND) - if(GLEW_FIND_REQUIRED) - message(FATAL_ERROR "Could not find Glew") - endif(GLEW_FIND_REQUIRED) -endif(GLEW_FOUND) - -mark_as_advanced( - GLEW_INCLUDE_DIR - GLEW_GLEW_LIBRARY - GLEW_Xmu_LIBRARY - GLEW_Xi_LIBRARY -) diff --git a/libs/eigen/cmake/FindGSL.cmake b/libs/eigen/cmake/FindGSL.cmake deleted file mode 100644 index 8632232..0000000 --- a/libs/eigen/cmake/FindGSL.cmake +++ /dev/null @@ -1,170 +0,0 @@ -# Try to find gnu scientific library GSL -# See -# http://www.gnu.org/software/gsl/ and -# http://gnuwin32.sourceforge.net/packages/gsl.htm -# -# Once run this will define: -# -# GSL_FOUND = system has GSL lib -# -# GSL_LIBRARIES = full path to the libraries -# on Unix/Linux with additional linker flags from "gsl-config --libs" -# -# CMAKE_GSL_CXX_FLAGS = Unix compiler flags for GSL, essentially "`gsl-config --cxxflags`" -# -# GSL_INCLUDE_DIR = where to find headers -# -# GSL_LINK_DIRECTORIES = link directories, useful for rpath on Unix -# GSL_EXE_LINKER_FLAGS = rpath on Unix -# -# Felix Woelk 07/2004 -# Jan Woetzel -# -# www.mip.informatik.uni-kiel.de -# -------------------------------- - -if(WIN32) - # JW tested with gsl-1.8, Windows XP, MSVS 7.1 - set(GSL_POSSIBLE_ROOT_DIRS - ${GSL_ROOT_DIR} - $ENV{GSL_ROOT_DIR} - ${GSL_DIR} - ${GSL_HOME} - $ENV{GSL_DIR} - $ENV{GSL_HOME} - $ENV{EXTRA} - "C:/Program Files/GnuWin32" - ) - find_path(GSL_INCLUDE_DIR - NAMES gsl/gsl_cdf.h gsl/gsl_randist.h - PATHS ${GSL_POSSIBLE_ROOT_DIRS} - PATH_SUFFIXES include - DOC "GSL header include dir" - ) - - find_library(GSL_GSL_LIBRARY - NAMES libgsl.dll.a gsl libgsl - PATHS ${GSL_POSSIBLE_ROOT_DIRS} - PATH_SUFFIXES lib - DOC "GSL library" ) - - if(NOT GSL_GSL_LIBRARY) - find_file(GSL_GSL_LIBRARY - NAMES libgsl.dll.a - PATHS 
${GSL_POSSIBLE_ROOT_DIRS} - PATH_SUFFIXES lib - DOC "GSL library") - endif() - - find_library(GSL_GSLCBLAS_LIBRARY - NAMES libgslcblas.dll.a gslcblas libgslcblas - PATHS ${GSL_POSSIBLE_ROOT_DIRS} - PATH_SUFFIXES lib - DOC "GSL cblas library dir" ) - - if(NOT GSL_GSLCBLAS_LIBRARY) - find_file(GSL_GSLCBLAS_LIBRARY - NAMES libgslcblas.dll.a - PATHS ${GSL_POSSIBLE_ROOT_DIRS} - PATH_SUFFIXES lib - DOC "GSL library") - endif() - - set(GSL_LIBRARIES ${GSL_GSL_LIBRARY}) - - #message("DBG\n" - # "GSL_GSL_LIBRARY=${GSL_GSL_LIBRARY}\n" - # "GSL_GSLCBLAS_LIBRARY=${GSL_GSLCBLAS_LIBRARY}\n" - # "GSL_LIBRARIES=${GSL_LIBRARIES}") - - -else(WIN32) - - if(UNIX) - set(GSL_CONFIG_PREFER_PATH - "$ENV{GSL_DIR}/bin" - "$ENV{GSL_DIR}" - "$ENV{GSL_HOME}/bin" - "$ENV{GSL_HOME}" - CACHE STRING "preferred path to GSL (gsl-config)") - find_program(GSL_CONFIG gsl-config - ${GSL_CONFIG_PREFER_PATH} - /usr/bin/ - ) - # message("DBG GSL_CONFIG ${GSL_CONFIG}") - - if (GSL_CONFIG) - # set CXXFLAGS to be fed into CXX_FLAGS by the user: - set(GSL_CXX_FLAGS "`${GSL_CONFIG} --cflags`") - - # set INCLUDE_DIRS to prefix+include - exec_program(${GSL_CONFIG} - ARGS --prefix - OUTPUT_VARIABLE GSL_PREFIX) - set(GSL_INCLUDE_DIR ${GSL_PREFIX}/include CACHE STRING INTERNAL) - - # set link libraries and link flags - #set(GSL_LIBRARIES "`${GSL_CONFIG} --libs`") - exec_program(${GSL_CONFIG} - ARGS --libs - OUTPUT_VARIABLE GSL_LIBRARIES ) - - # extract link dirs for rpath - exec_program(${GSL_CONFIG} - ARGS --libs - OUTPUT_VARIABLE GSL_CONFIG_LIBS ) - - # extract version - exec_program(${GSL_CONFIG} - ARGS --version - OUTPUT_VARIABLE GSL_FULL_VERSION ) - - # split version as major/minor - string(REGEX MATCH "(.)\\..*" GSL_VERSION_MAJOR_ "${GSL_FULL_VERSION}") - set(GSL_VERSION_MAJOR ${CMAKE_MATCH_1}) - string(REGEX MATCH ".\\.(.*)" GSL_VERSION_MINOR_ "${GSL_FULL_VERSION}") - set(GSL_VERSION_MINOR ${CMAKE_MATCH_1}) - - # split off the link dirs (for rpath) - # use regular expression to match wildcard equivalent "-L*" - # with is a space or a semicolon - string(REGEX MATCHALL "[-][L]([^ ;])+" - GSL_LINK_DIRECTORIES_WITH_PREFIX - "${GSL_CONFIG_LIBS}" ) - # message("DBG GSL_LINK_DIRECTORIES_WITH_PREFIX=${GSL_LINK_DIRECTORIES_WITH_PREFIX}") - - # remove prefix -L because we need the pure directory for LINK_DIRECTORIES - - if (GSL_LINK_DIRECTORIES_WITH_PREFIX) - string(REGEX REPLACE "[-][L]" "" GSL_LINK_DIRECTORIES ${GSL_LINK_DIRECTORIES_WITH_PREFIX} ) - endif (GSL_LINK_DIRECTORIES_WITH_PREFIX) - set(GSL_EXE_LINKER_FLAGS "-Wl,-rpath,${GSL_LINK_DIRECTORIES}" CACHE STRING INTERNAL) - # message("DBG GSL_LINK_DIRECTORIES=${GSL_LINK_DIRECTORIES}") - # message("DBG GSL_EXE_LINKER_FLAGS=${GSL_EXE_LINKER_FLAGS}") - - # add_definitions("-DHAVE_GSL") - # set(GSL_DEFINITIONS "-DHAVE_GSL") - mark_as_advanced( - GSL_CXX_FLAGS - GSL_INCLUDE_DIR - GSL_LIBRARIES - GSL_LINK_DIRECTORIES - GSL_DEFINITIONS - ) - message(STATUS "Using GSL from ${GSL_PREFIX}") - - else(GSL_CONFIG) - message("FindGSL.cmake: gsl-config not found. Please set it manually. 
GSL_CONFIG=${GSL_CONFIG}") - endif(GSL_CONFIG) - - endif(UNIX) -endif(WIN32) - - -if(GSL_LIBRARIES) - if(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS) - - set(GSL_FOUND 1) - - endif(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS) -endif(GSL_LIBRARIES) diff --git a/libs/eigen/cmake/FindLAPACK.cmake b/libs/eigen/cmake/FindLAPACK.cmake deleted file mode 100644 index 3fd7388..0000000 --- a/libs/eigen/cmake/FindLAPACK.cmake +++ /dev/null @@ -1,274 +0,0 @@ -# Find LAPACK library -# -# This module finds an installed library that implements the LAPACK -# linear-algebra interface (see http://www.netlib.org/lapack/). -# The approach follows mostly that taken for the autoconf macro file, acx_lapack.m4 -# (distributed at http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html). -# -# This module sets the following variables: -# LAPACK_FOUND - set to true if a library implementing the LAPACK interface -# is found -# LAPACK_INCLUDE_DIR - Directories containing the LAPACK header files -# LAPACK_DEFINITIONS - Compilation options to use LAPACK -# LAPACK_LINKER_FLAGS - Linker flags to use LAPACK (excluding -l -# and -L). -# LAPACK_LIBRARIES_DIR - Directories containing the LAPACK libraries. -# May be null if LAPACK_LIBRARIES contains libraries name using full path. -# LAPACK_LIBRARIES - List of libraries to link against LAPACK interface. -# May be null if the compiler supports auto-link (e.g. VC++). -# LAPACK_USE_FILE - The name of the cmake module to include to compile -# applications or libraries using LAPACK. -# -# This module was modified by CGAL team: -# - find libraries for a C++ compiler, instead of Fortran -# - added LAPACK_INCLUDE_DIR, LAPACK_DEFINITIONS and LAPACK_LIBRARIES_DIR -# - removed LAPACK95_LIBRARIES - - -include(CheckFunctionExists) -include(CMakeFindDependencyMacro) - -# This macro checks for the existence of the combination of fortran libraries -# given by _list. If the combination is found, this macro checks (using the -# check_function_exists macro) whether can link against that library -# combination using the name of a routine given by _name using the linker -# flags given by _flags. If the combination of libraries is found and passes -# the link test, LIBRARIES is set to the list of complete library paths that -# have been found and DEFINITIONS to the required definitions. -# Otherwise, LIBRARIES is set to FALSE. -# N.B. _prefix is the prefix applied to the names of all cached variables that -# are generated internally and marked advanced by this macro. 
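# The probing described above reduces to two stock CMake steps: resolve each
# candidate name to a full path with find_library(), then try to link a call
# to the routine via check_function_exists(). A minimal sketch of the same
# idea (MY_LAPACK_LIBRARY and MY_LAPACK_cheev_WORKS are hypothetical names):
#
#   include(CheckFunctionExists)
#   find_library(MY_LAPACK_LIBRARY NAMES lapack)
#   if(MY_LAPACK_LIBRARY)
#     set(CMAKE_REQUIRED_LIBRARIES ${MY_LAPACK_LIBRARY})
#     # Fortran entry points are usually reachable from C with a trailing underscore.
#     check_function_exists(cheev_ MY_LAPACK_cheev_WORKS)
#     set(CMAKE_REQUIRED_LIBRARIES "")
#   endif()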
-macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _blas _path) - #message("DEBUG: check_lapack_libraries(${_list} in ${_path} with ${_blas})") - - # Check for the existence of the libraries given by _list - set(_libraries_found TRUE) - set(_libraries_work FALSE) - set(${DEFINITIONS} "") - set(${LIBRARIES} "") - set(_combined_name) - foreach(_library ${_list}) - set(_combined_name ${_combined_name}_${_library}) - - if(_libraries_found) - # search first in ${_path} - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ${_path} NO_DEFAULT_PATH - ) - # if not found, search in environment variables and system - if ( WIN32 ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ENV LIB - ) - elseif ( APPLE ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH - ) - else () - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - ) - endif() - mark_as_advanced(${_prefix}_${_library}_LIBRARY) - set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_found ${${_prefix}_${_library}_LIBRARY}) - endif() - endforeach() - if(_libraries_found) - set(_libraries_found ${${LIBRARIES}}) - endif() - - # Test this combination of libraries with the Fortran/f2c interface. - # We test the Fortran interface first as it is well standardized. - if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "-D${_prefix}_USE_F2C") - set(${LIBRARIES} ${_libraries_found}) - # Some C++ linkers require the f2c library to link with Fortran libraries. - # I do not know which ones, thus I just add the f2c library if it is available. - find_dependency( F2C QUIET ) - if ( F2C_FOUND ) - set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) - set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) - endif() - set(CMAKE_REQUIRED_DEFINITIONS ${${DEFINITIONS}}) - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas}) - #message("DEBUG: CMAKE_REQUIRED_DEFINITIONS = ${CMAKE_REQUIRED_DEFINITIONS}") - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - # Check if function exists with f2c calling convention (ie a trailing underscore) - check_function_exists(${_name}_ ${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(CMAKE_REQUIRED_DEFINITIONS} "") - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS}) - endif() - - # If not found, test this combination of libraries with a C interface. - # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard. 
- if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} ${_libraries_found}) - set(CMAKE_REQUIRED_DEFINITIONS "") - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas}) - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - check_function_exists(${_name} ${_prefix}_${_name}${_combined_name}_WORKS) - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS}) - endif() - - # on failure - if(NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} FALSE) - endif() - #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}") - #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") -endmacro() - - -# -# main -# - -# LAPACK requires BLAS -if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - find_dependency(BLAS) -else() - find_dependency(BLAS REQUIRED) -endif() - -if (NOT BLAS_FOUND) - - message(STATUS "LAPACK requires BLAS.") - set(LAPACK_FOUND FALSE) - -# Is it already configured? -elseif (LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES) - - set(LAPACK_FOUND TRUE) - -else() - - # reset variables - set( LAPACK_INCLUDE_DIR "" ) - set( LAPACK_DEFINITIONS "" ) - set( LAPACK_LINKER_FLAGS "" ) # unused (yet) - set( LAPACK_LIBRARIES "" ) - set( LAPACK_LIBRARIES_DIR "" ) - - # - # If Unix, search for LAPACK function in possible libraries - # - - #intel mkl lapack? - if(NOT LAPACK_LIBRARIES) - check_lapack_libraries( - LAPACK_DEFINITIONS - LAPACK_LIBRARIES - LAPACK - cheev - "" - "mkl_lapack" - "${BLAS_LIBRARIES}" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" - ) - endif() - - #acml lapack? - if(NOT LAPACK_LIBRARIES) - check_lapack_libraries( - LAPACK_DEFINITIONS - LAPACK_LIBRARIES - LAPACK - cheev - "" - "acml" - "${BLAS_LIBRARIES}" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" - ) - endif() - - # Apple LAPACK library? - if(NOT LAPACK_LIBRARIES) - check_lapack_libraries( - LAPACK_DEFINITIONS - LAPACK_LIBRARIES - LAPACK - cheev - "" - "Accelerate" - "${BLAS_LIBRARIES}" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" - ) - endif() - - if ( NOT LAPACK_LIBRARIES ) - check_lapack_libraries( - LAPACK_DEFINITIONS - LAPACK_LIBRARIES - LAPACK - cheev - "" - "vecLib" - "${BLAS_LIBRARIES}" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" - ) - endif () - - # Generic LAPACK library? - # This configuration *must* be the last try as this library is notably slow. - if ( NOT LAPACK_LIBRARIES ) - check_lapack_libraries( - LAPACK_DEFINITIONS - LAPACK_LIBRARIES - LAPACK - cheev - "" - "lapack" - "${BLAS_LIBRARIES}" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" - ) - endif() - - if(LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES) - set(LAPACK_FOUND TRUE) - else() - set(LAPACK_FOUND FALSE) - endif() - - if(NOT LAPACK_FIND_QUIETLY) - if(LAPACK_FOUND) - message(STATUS "A library with LAPACK API found.") - else() - if(LAPACK_FIND_REQUIRED) - message(FATAL_ERROR "A required library with LAPACK API not found. Please specify library location.") - else() - message(STATUS "A library with LAPACK API not found. 
Please specify library location.") - endif() - endif() - endif() - - # Add variables to cache - set( LAPACK_INCLUDE_DIR "${LAPACK_INCLUDE_DIR}" - CACHE PATH "Directories containing the LAPACK header files" FORCE ) - set( LAPACK_DEFINITIONS "${LAPACK_DEFINITIONS}" - CACHE STRING "Compilation options to use LAPACK" FORCE ) - set( LAPACK_LINKER_FLAGS "${LAPACK_LINKER_FLAGS}" - CACHE STRING "Linker flags to use LAPACK" FORCE ) - set( LAPACK_LIBRARIES "${LAPACK_LIBRARIES}" - CACHE FILEPATH "LAPACK libraries name" FORCE ) - set( LAPACK_LIBRARIES_DIR "${LAPACK_LIBRARIES_DIR}" - CACHE PATH "Directories containing the LAPACK libraries" FORCE ) - - #message("DEBUG: LAPACK_INCLUDE_DIR = ${LAPACK_INCLUDE_DIR}") - #message("DEBUG: LAPACK_DEFINITIONS = ${LAPACK_DEFINITIONS}") - #message("DEBUG: LAPACK_LINKER_FLAGS = ${LAPACK_LINKER_FLAGS}") - #message("DEBUG: LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") - #message("DEBUG: LAPACK_LIBRARIES_DIR = ${LAPACK_LIBRARIES_DIR}") - #message("DEBUG: LAPACK_FOUND = ${LAPACK_FOUND}") - -endif() diff --git a/libs/eigen/cmake/UseEigen3.cmake b/libs/eigen/cmake/UseEigen3.cmake deleted file mode 100644 index a38bac8..0000000 --- a/libs/eigen/cmake/UseEigen3.cmake +++ /dev/null @@ -1,6 +0,0 @@ -# -*- cmake -*- -# -# UseEigen3.cmake - -add_definitions ( ${EIGEN3_DEFINITIONS} ) -include_directories ( ${EIGEN3_INCLUDE_DIRS} ) diff --git a/libs/eigen/debug/gdb/printers.py b/libs/eigen/debug/gdb/printers.py index 24961d1..2c3fccf 100644 --- a/libs/eigen/debug/gdb/printers.py +++ b/libs/eigen/debug/gdb/printers.py @@ -22,29 +22,29 @@ # import sys # sys.path.insert(0, '/path/to/eigen/printer/directory') # from printers import register_eigen_printers -# register_eigen_printers (None) +# register_eigen_printers(None) # end import gdb import re -import itertools from bisect import bisect_left + # Basic row/column iteration code for use with Sparse and Dense matrices class _MatrixEntryIterator(object): - def __init__ (self, rows, cols, rowMajor): + def __init__(self, rows, cols, row_major): self.rows = rows self.cols = cols self.currentRow = 0 self.currentCol = 0 - self.rowMajor = rowMajor + self.rowMajor = row_major - def __iter__ (self): + def __iter__(self): return self def next(self): - return self.__next__() # Python 2.x compatibility + return self.__next__() # Python 2.x compatibility def __next__(self): row = self.currentRow @@ -53,54 +53,55 @@ class _MatrixEntryIterator(object): if self.currentCol >= self.cols: raise StopIteration - self.currentRow = self.currentRow + 1 + self.currentRow += 1 if self.currentRow >= self.rows: self.currentRow = 0 - self.currentCol = self.currentCol + 1 + self.currentCol += 1 else: if self.currentRow >= self.rows: raise StopIteration - self.currentCol = self.currentCol + 1 + self.currentCol += 1 if self.currentCol >= self.cols: self.currentCol = 0 - self.currentRow = self.currentRow + 1 + self.currentRow += 1 + + return row, col - return (row, col) class EigenMatrixPrinter: - "Print Eigen Matrix or Array of some kind" + """Print Eigen Matrix or Array of some kind""" def __init__(self, variety, val): - "Extract all the necessary information" + """Extract all the necessary information""" # Save the variety (presumably "Matrix" or "Array") for later usage self.variety = variety # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() + typeinfo = val.type + if typeinfo.code == 
gdb.TYPE_CODE_REF: + typeinfo = typeinfo.target() + self.type = typeinfo.unqualified().strip_typedefs() tag = self.type.tag - regex = re.compile('\<.*\>') + regex = re.compile('<.*>') m = regex.findall(tag)[0][1:-1] template_params = m.split(',') template_params = [x.replace(" ", "") for x in template_params] - if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1': + if template_params[1] in ['-0x00000000000000001', '-0x000000001', '-1']: self.rows = val['m_storage']['m_rows'] else: self.rows = int(template_params[1]) - if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1': + if template_params[2] in ['-0x00000000000000001', '-0x000000001', '-1']: self.cols = val['m_storage']['m_cols'] else: self.cols = int(template_params[2]) - self.options = 0 # default value + self.options = 0 # default value if len(template_params) > 3: - self.options = template_params[3]; + self.options = template_params[3] self.rowMajor = (int(self.options) & 0x1) @@ -114,50 +115,51 @@ class EigenMatrixPrinter: self.data = self.data['array'] self.data = self.data.cast(self.innerType.pointer()) - class _iterator(_MatrixEntryIterator): - def __init__ (self, rows, cols, dataPtr, rowMajor): - super(EigenMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor) + class _Iterator(_MatrixEntryIterator): + def __init__(self, rows, cols, data_ptr, row_major): + super(EigenMatrixPrinter._Iterator, self).__init__(rows, cols, row_major) - self.dataPtr = dataPtr + self.dataPtr = data_ptr def __next__(self): - - row, col = super(EigenMatrixPrinter._iterator, self).__next__() + row, col = super(EigenMatrixPrinter._Iterator, self).__next__() item = self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - if (self.cols == 1): #if it's a column vector - return ('[%d]' % (row,), item) - elif (self.rows == 1): #if it's a row vector - return ('[%d]' % (col,), item) - return ('[%d,%d]' % (row, col), item) + self.dataPtr += 1 + if self.cols == 1: # if it's a column vector + return '[%d]' % (row,), item + elif self.rows == 1: # if it's a row vector + return '[%d]' % (col,), item + return '[%d,%d]' % (row, col), item def children(self): - - return self._iterator(self.rows, self.cols, self.data, self.rowMajor) + return self._Iterator(self.rows, self.cols, self.data, self.rowMajor) def to_string(self): - return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else "ColMajor", self.data) + return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % ( + self.variety, self.innerType, self.rows, self.cols, + "RowMajor" if self.rowMajor else "ColMajor", self.data) + class EigenSparseMatrixPrinter: - "Print an Eigen SparseMatrix" + """Print an Eigen SparseMatrix""" def __init__(self, val): - "Extract all the necessary information" + """Extract all the necessary information""" - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() + typeinfo = val.type + if typeinfo.code == gdb.TYPE_CODE_REF: + typeinfo = typeinfo.target() + self.type = typeinfo.unqualified().strip_typedefs() tag = self.type.tag - regex = re.compile('\<.*\>') + regex = re.compile('<.*>') m = regex.findall(tag)[0][1:-1] template_params = m.split(',') template_params = [x.replace(" ", "") for x in template_params] self.options = 0 if len(template_params) > 1: - self.options = template_params[1]; + self.options = 
template_params[1] self.rowMajor = (int(self.options) & 0x1) @@ -168,22 +170,23 @@ class EigenSparseMatrixPrinter: self.data = self.val['m_data'] self.data = self.data.cast(self.innerType.pointer()) - class _iterator(_MatrixEntryIterator): - def __init__ (self, rows, cols, val, rowMajor): - super(EigenSparseMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor) + class _Iterator(_MatrixEntryIterator): + def __init__(self, rows, cols, val, row_major): + super(EigenSparseMatrixPrinter._Iterator, self).__init__(rows, cols, row_major) self.val = val def __next__(self): - - row, col = super(EigenSparseMatrixPrinter._iterator, self).__next__() + row, col = super(EigenSparseMatrixPrinter._Iterator, self).__next__() # repeat calculations from SparseMatrix.h: outer = row if self.rowMajor else col inner = col if self.rowMajor else row start = self.val['m_outerIndex'][outer] - end = ((start + self.val['m_innerNonZeros'][outer]) if self.val['m_innerNonZeros'] else - self.val['m_outerIndex'][outer+1]) + end = ( + (start + self.val['m_innerNonZeros'][outer]) + if self.val['m_innerNonZeros'] else self.val['m_outerIndex'][outer+1] + ) # and from CompressedStorage.h: data = self.val['m_data'] @@ -196,20 +199,19 @@ class EigenSparseMatrixPrinter: indices = [data['m_indices'][x] for x in range(int(start), int(end)-1)] # find the index with binary search idx = int(start) + bisect_left(indices, inner) - if ((idx < end) and (data['m_indices'][idx] == inner)): + if idx < end and data['m_indices'][idx] == inner: item = data['m_values'][idx] else: item = 0 - return ('[%d,%d]' % (row, col), item) + return '[%d,%d]' % (row, col), item def children(self): if self.data: - return self._iterator(self.rows(), self.cols(), self.val, self.rowMajor) + return self._Iterator(self.rows(), self.cols(), self.val, self.rowMajor) return iter([]) # empty matrix, for now - def rows(self): return self.val['m_outerSize'] if self.rowMajor else self.val['m_innerSize'] @@ -222,22 +224,23 @@ class EigenSparseMatrixPrinter: status = ("not compressed" if self.val['m_innerNonZeros'] else "compressed") else: status = "empty" - dimensions = "%d x %d" % (self.rows(), self.cols()) - layout = "row" if self.rowMajor else "column" + dimensions = "%d x %d" % (self.rows(), self.cols()) + layout = "row" if self.rowMajor else "column" return "Eigen::SparseMatrix<%s>, %s, %s major, %s" % ( - self.innerType, dimensions, layout, status ) + self.innerType, dimensions, layout, status) + class EigenQuaternionPrinter: - "Print an Eigen Quaternion" + """Print an Eigen Quaternion""" def __init__(self, val): - "Extract all the necessary information" + """Extract all the necessary information""" # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() + typeinfo = val.type + if typeinfo.code == gdb.TYPE_CODE_REF: + typeinfo = typeinfo.target() + self.type = typeinfo.unqualified().strip_typedefs() self.innerType = self.type.template_argument(0) self.val = val @@ -245,13 +248,13 @@ class EigenQuaternionPrinter: self.data = self.val['m_coeffs']['m_storage']['m_data']['array'] self.data = self.data.cast(self.innerType.pointer()) - class _iterator: - def __init__ (self, dataPtr): - self.dataPtr = dataPtr + class _Iterator: + def __init__(self, data_ptr): + self.dataPtr = data_ptr self.currentElement = 0 self.elementNames = ['x', 'y', 'z', 'w'] - def __iter__ (self): + def __iter__(self): return self 
def next(self): @@ -260,47 +263,67 @@ class EigenQuaternionPrinter: def __next__(self): element = self.currentElement - if self.currentElement >= 4: #there are 4 elements in a quanternion + if self.currentElement >= 4: # there are 4 elements in a quaternion raise StopIteration - self.currentElement = self.currentElement + 1 + self.currentElement += 1 item = self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - return ('[%s]' % (self.elementNames[element],), item) + self.dataPtr += 1 + return '[%s]' % (self.elementNames[element],), item def children(self): - - return self._iterator(self.data) + return self._Iterator(self.data) def to_string(self): return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data) -def build_eigen_dictionary (): + +def cast_eigen_block_to_matrix(val): + # Get the type of the variable (and convert to a string) + # Example: 'const Eigen::Block, -1, -1, false> const, -1, -1, false>' + val_type = str(val.type) + + # Extract the Eigen::Matrix type from the Block: + # From the previous example: Eigen::Matrix + begin = val_type.find('Eigen::Matrix<') + end = val_type.find('>', begin) + 1 + + # Convert the Eigen::Block to an Eigen::Matrix + return val.cast(gdb.lookup_type(val_type[begin:end])) + + +def build_eigen_dictionary(): pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val) pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val) + pretty_printers_dict[re.compile('^Eigen::Block<.*>$')] =\ + lambda val: EigenMatrixPrinter("Matrix", cast_eigen_block_to_matrix(val)) + pretty_printers_dict[re.compile('^Eigen::VectorBlock<.*>$')] =\ + lambda val: EigenMatrixPrinter("Matrix", cast_eigen_block_to_matrix(val)) pretty_printers_dict[re.compile('^Eigen::SparseMatrix<.*>$')] = lambda val: EigenSparseMatrixPrinter(val) - pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val) + pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val) + def register_eigen_printers(obj): - "Register eigen pretty-printers with objfile Obj" + """Register eigen pretty-printers with objfile Obj""" - if obj == None: + if obj is None: obj = gdb obj.pretty_printers.append(lookup_function) + def lookup_function(val): - "Look-up and return a pretty-printer that can print va." + """Look-up and return a pretty-printer that can print val.""" - type = val.type + typeinfo = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() + if typeinfo.code == gdb.TYPE_CODE_REF: + typeinfo = typeinfo.target() - type = type.unqualified().strip_typedefs() + typeinfo = typeinfo.unqualified().strip_typedefs() - typename = type.tag - if typename == None: + typename = typeinfo.tag + if typename is None: return None for function in pretty_printers_dict: @@ -309,6 +332,7 @@ def lookup_function(val): return None + pretty_printers_dict = {} -build_eigen_dictionary () +build_eigen_dictionary() diff --git a/libs/eigen/debug/lldb/eigenlldb.py b/libs/eigen/debug/lldb/eigenlldb.py new file mode 100644 index 0000000..d9b5d06 --- /dev/null +++ b/libs/eigen/debug/lldb/eigenlldb.py @@ -0,0 +1,234 @@ +# -*- coding: utf-8 -*- +# This file is part of Eigen, a lightweight C++ template library +# for linear algebra. +# +# Copyright (C) 2021 Huang, Zhaoquan +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Pretty printers for Eigen::Matrix to use with LLDB debugger +# +# Usage: +# 1. Add the following line (change it according to the path to this file) +# to the file ~/.lldbinit (create one if it doesn't exist): +# `command script import /path/to/eigenlldb.py` +# 2. Inspect the variables in LLDB command line +# `frame variable` + +import lldb +from typing import List +import bisect + + +def __lldb_init_module(debugger, internal_dict): + debugger.HandleCommand("type synthetic add -x Eigen::Matrix<.*> --python-class eigenlldb.EigenMatrixChildProvider") + debugger.HandleCommand( + "type synthetic add -x Eigen::SparseMatrix<.*> --python-class eigenlldb.EigenSparseMatrixChildProvider") + + +class EigenMatrixChildProvider: + _valobj: lldb.SBValue + _scalar_type: lldb.SBType + _scalar_size: int + _rows_compile_time: int + _cols_compile_time: int + _row_major: bool + _fixed_storage: bool + + def __init__(self, valobj, internal_dict): + self._valobj = valobj + valtype = valobj.GetType().GetCanonicalType() + + scalar_type = valtype.GetTemplateArgumentType(0) + if not scalar_type.IsValid(): + # In the case that scalar_type is invalid on LLDB 9.0 on Windows with CLion + storage = valobj.GetChildMemberWithName("m_storage") + data = storage.GetChildMemberWithName("m_data") + data_type = data.GetType() + if data_type.IsPointerType(): + scalar_type = data.GetType().GetPointeeType() + else: + scalar_type = data.GetChildMemberWithName("array").GetType().GetArrayElementType() + self._scalar_type = scalar_type + self._scalar_size = self._scalar_type.GetByteSize() + + name = valtype.GetName() + template_begin = name.find("<") + template_end = name.find(">") + template_args = name[(template_begin + 1):template_end].split(",") + self._rows_compile_time = int(template_args[1]) + self._cols_compile_time = int(template_args[2]) + self._row_major = (int(template_args[3]) & 1) != 0 + + max_rows = int(template_args[4]) + max_cols = int(template_args[5]) + self._fixed_storage = (max_rows != -1 and max_cols != -1) + + def num_children(self): + return self._cols() * self._rows() + + def get_child_index(self, name): + pass + + def get_child_at_index(self, index): + storage = self._valobj.GetChildMemberWithName("m_storage") + data = storage.GetChildMemberWithName("m_data") + offset = self._scalar_size * index + + if self._row_major: + row = index // self._cols() + col = index % self._cols() + else: + row = index % self._rows() + col = index // self._rows() + if self._fixed_storage: + data = data.GetChildMemberWithName("array") + if self._cols() == 1: + name = '[{}]'.format(row) + elif self._rows() == 1: + name = '[{}]'.format(col) + else: + name = '[{},{}]'.format(row, col) + return data.CreateChildAtOffset( + name, offset, self._scalar_type + ) + + def _cols(self): + if self._cols_compile_time == -1: + storage = self._valobj.GetChildMemberWithName("m_storage") + cols = storage.GetChildMemberWithName("m_cols") + return cols.GetValueAsUnsigned() + else: + return self._cols_compile_time + + def _rows(self): + if self._rows_compile_time == -1: + storage = self._valobj.GetChildMemberWithName("m_storage") + rows = storage.GetChildMemberWithName("m_rows") + return rows.GetValueAsUnsigned() + else: + return self._rows_compile_time + + +class EigenSparseMatrixChildProvider: + _valobj: lldb.SBValue + _scalar_type: lldb.SBType + _scalar_size: int + _index_type: lldb.SBType + _index_size: int + _row_major: bool + + 
_outer_size: int + _nnz: int + _values: lldb.SBValue + _inner_indices: lldb.SBValue + _outer_starts: lldb.SBValue + _inner_nnzs: lldb.SBValue + _compressed: bool + + # Index of the first synthetic child under each outer index + _child_indices: List[int] + + def __init__(self, valobj, internal_dict): + self._valobj = valobj + valtype = valobj.GetType().GetCanonicalType() + scalar_type = valtype.GetTemplateArgumentType(0) + if not scalar_type.IsValid(): + # In the case that scalar_type is invalid on LLDB 9.0 on Windows with CLion + data = valobj.GetChildMemberWithName("m_data") + values = data.GetChildMemberWithName("m_values") + scalar_type = values.GetType().GetPointeeType() + self._scalar_type = scalar_type + self._scalar_size = scalar_type.GetByteSize() + + index_type = valtype.GetTemplateArgumentType(2) + if not index_type.IsValid(): + # In the case that scalar_type is invalid on LLDB 9.0 on Windows with CLion + outer_starts = valobj.GetChildMemberWithName("m_outerIndex") + index_type = outer_starts.GetType().GetPointeeType() + self._index_type = index_type + self._index_size = index_type.GetByteSize() + + name = valtype.GetName() + template_begin = name.find("<") + template_end = name.find(">") + template_args = name[(template_begin + 1):template_end].split(",") + self._row_major = (int(template_args[1]) & 1) != 0 + + def num_children(self): + return self._nnz + 2 + + def get_child_index(self, name): + pass + + def get_child_at_index(self, index): + if index == 0: + name = "rows" if self._row_major else "cols" + return self._valobj.GetChildMemberWithName("m_outerSize") \ + .CreateChildAtOffset(name, 0, self._index_type) + elif index == 1: + name = "cols" if self._row_major else "rows" + return self._valobj.GetChildMemberWithName("m_innerSize") \ + .CreateChildAtOffset(name, 0, self._index_type) + else: + index = index - 2 + outer_index = bisect.bisect_right(self._child_indices, index) - 1 + total_nnzs = self._child_indices[outer_index] + if self._compressed: + item_index = index + inner_index = self._inner_indices \ + .CreateChildAtOffset("", item_index * self._index_size, self._index_type) \ + .GetValueAsUnsigned() + return self._values \ + .CreateChildAtOffset(self._child_name(outer_index, inner_index), + item_index * self._scalar_size, + self._scalar_type) + else: + index_begin = self._outer_starts \ + .CreateChildAtOffset("", outer_index * self._index_size, self._index_type) \ + .GetValueAsUnsigned() + item_index = index - total_nnzs + index_begin + inner_index = self._inner_indices \ + .CreateChildAtOffset("", item_index * self._index_size, self._index_type) \ + .GetValueAsUnsigned() + return self._values \ + .CreateChildAtOffset(self._child_name(outer_index, inner_index), + item_index * self._scalar_size, + self._scalar_type) + + def update(self): + valobj = self._valobj + self._outer_size = valobj.GetChildMemberWithName("m_outerSize").GetValueAsUnsigned() + data = valobj.GetChildMemberWithName("m_data") + self._values = data.GetChildMemberWithName("m_values") + self._inner_indices = data.GetChildMemberWithName("m_indices") + self._outer_starts = valobj.GetChildMemberWithName("m_outerIndex") + self._inner_nnzs = valobj.GetChildMemberWithName("m_innerNonZeros") + + self._compressed = self._inner_nnzs.GetValueAsUnsigned() == 0 + + total_nnzs = 0 + child_indices = [0] + for outer_index in range(self._outer_size): + if self._compressed: + index_end = self._outer_starts \ + .CreateChildAtOffset("", (outer_index + 1) * self._index_size, self._index_type) \ + .GetValueAsUnsigned() + 
                total_nnzs = index_end
+                child_indices.append(total_nnzs)
+            else:
+                nnzs = self._inner_nnzs \
+                    .CreateChildAtOffset("", outer_index * self._index_size, self._index_type) \
+                    .GetValueAsUnsigned()
+                total_nnzs = total_nnzs + nnzs
+                child_indices.append(total_nnzs)
+        self._child_indices = child_indices
+        self._nnz = total_nnzs
+
+    def _child_name(self, outer_index, inner_index):
+        if self._row_major:
+            return "[{0},{1}]".format(outer_index, inner_index)
+        else:
+            return "[{1},{0}]".format(outer_index, inner_index)
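The `_child_indices` table built in `update()` above is a running total of nonzeros per outer vector (with a leading 0), so `get_child_at_index()` can map a flat child index to its outer vector with one binary search. A self-contained model of that lookup; the nonzero counts are hypothetical, and note that in the provider children 0 and 1 expose the dimensions, so it subtracts 2 before this step:

    from bisect import bisect_right

    # Running totals for three outer vectors holding 2, 0 and 3 nonzeros.
    child_indices = [0, 2, 2, 5]

    def locate(flat_index):
        # Map a flat nonzero index to (outer vector, position within it).
        outer = bisect_right(child_indices, flat_index) - 1
        return outer, flat_index - child_indices[outer]

    assert locate(0) == (0, 0)  # first nonzero of outer vector 0
    assert locate(2) == (2, 0)  # vector 1 is empty, so child 2 opens vector 2
    assert locate(4) == (2, 2)  # last stored nonzero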
diff --git a/libs/eigen/demos/opengl/gpuhelper.h b/libs/eigen/demos/opengl/gpuhelper.h
index 9ff98e9..880e9a5 100644
--- a/libs/eigen/demos/opengl/gpuhelper.h
+++ b/libs/eigen/demos/opengl/gpuhelper.h
@@ -34,21 +34,21 @@ class GpuHelper
     Essentially, this helper function automatically calls glMatrixMode(matrixTarget) if required
     and does a proper call to the right glMultMatrix*() function according to the scalar type
     and storage order.
-    \warning glMatrixMode() must never be called directly. If your're unsure, use forceMatrixMode().
+    \warning glMatrixMode() must never be called directly. If you are unsure, use forceMatrixMode().
     \sa Matrix, loadMatrix(), forceMatrixMode()
   */
-    template<typename Scalar, int _Flags>
-    void multMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+    template<typename Scalar, int Flags_>
+    void multMatrix(const Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget);
 
     /** Load the matrix \a mat to the OpenGL matrix \a matrixTarget.
         Essentially, this helper function automatically calls glMatrixMode(matrixTarget) if required
         and does a proper call to the right glLoadMatrix*() or glLoadIdentity() function according to the scalar type
         and storage order.
-        \warning glMatrixMode() must never be called directly. If your're unsure, use forceMatrixMode().
+        \warning glMatrixMode() must never be called directly. If you are unsure, use forceMatrixMode().
         \sa Matrix, multMatrix(), forceMatrixMode()
     */
-    template<typename Scalar, int _Flags>
-    void loadMatrix(const Eigen::Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+    template<typename Scalar, int Flags_>
+    void loadMatrix(const Eigen::Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget);
 
     template<typename Scalar, typename Derived>
     void loadMatrix(
@@ -66,8 +66,8 @@ class GpuHelper
 
     /** Push the OpenGL matrix \a matrixTarget and load \a mat.
     */
-    template<typename Scalar, int _Flags>
-    inline void pushMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget);
+    template<typename Scalar, int Flags_>
+    inline void pushMatrix(const Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget);
 
     template<typename Scalar, typename Derived>
     void pushMatrix(
@@ -113,22 +113,22 @@ extern GpuHelper gpu;
 
 /** \internal
 */
-template<bool RowMajor, int _Flags> struct GlMatrixHelper;
+template<bool RowMajor, int Flags_> struct GlMatrixHelper;
 
-template<int _Flags> struct GlMatrixHelper<false,_Flags>
+template<int Flags_> struct GlMatrixHelper<false,Flags_>
 {
-    static void loadMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glLoadMatrixf(mat.data()); }
-    static void loadMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glLoadMatrixd(mat.data()); }
-    static void multMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glMultMatrixf(mat.data()); }
-    static void multMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glMultMatrixd(mat.data()); }
+    static void loadMatrix(const Matrix<float, 4,4, Flags_, 4,4>&  mat) { glLoadMatrixf(mat.data()); }
+    static void loadMatrix(const Matrix<double,4,4, Flags_, 4,4>& mat) { glLoadMatrixd(mat.data()); }
+    static void multMatrix(const Matrix<float, 4,4, Flags_, 4,4>&  mat) { glMultMatrixf(mat.data()); }
+    static void multMatrix(const Matrix<double,4,4, Flags_, 4,4>& mat) { glMultMatrixd(mat.data()); }
 };
 
-template<int _Flags> struct GlMatrixHelper<true,_Flags>
+template<int Flags_> struct GlMatrixHelper<true,Flags_>
 {
-    static void loadMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glLoadMatrixf(mat.transpose().eval().data()); }
-    static void loadMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glLoadMatrixd(mat.transpose().eval().data()); }
-    static void multMatrix(const Matrix<float, 4,4, _Flags, 4,4>&  mat) { glMultMatrixf(mat.transpose().eval().data()); }
-    static void multMatrix(const Matrix<double,4,4, _Flags, 4,4>& mat) { glMultMatrixd(mat.transpose().eval().data()); }
+    static void loadMatrix(const Matrix<float, 4,4, Flags_, 4,4>&  mat) { glLoadMatrixf(mat.transpose().eval().data()); }
+    static void loadMatrix(const Matrix<double,4,4, Flags_, 4,4>& mat) { glLoadMatrixd(mat.transpose().eval().data()); }
+    static void multMatrix(const Matrix<float, 4,4, Flags_, 4,4>&  mat) { glMultMatrixf(mat.transpose().eval().data()); }
+    static void multMatrix(const Matrix<double,4,4, Flags_, 4,4>& mat) { glMultMatrixd(mat.transpose().eval().data()); }
 };
 
 inline void GpuHelper::setMatrixTarget(GLenum matrixTarget)
@@ -137,11 +137,11 @@
     glMatrixMode(mCurrentMatrixTarget=matrixTarget);
 }
 
-template<typename Scalar, int _Flags>
-void GpuHelper::multMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+template<typename Scalar, int Flags_>
+void GpuHelper::multMatrix(const Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget)
 {
   setMatrixTarget(matrixTarget);
-  GlMatrixHelper<_Flags&Eigen::RowMajorBit, _Flags>::multMatrix(mat);
+  GlMatrixHelper<Flags_&Eigen::RowMajorBit, Flags_>::multMatrix(mat);
 }
 
 template<typename Scalar, typename Derived>
@@ -153,11 +153,11 @@ void GpuHelper::loadMatrix(
     glLoadIdentity();
 }
 
-template<typename Scalar, int _Flags>
-void GpuHelper::loadMatrix(const Eigen::Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+template<typename Scalar, int Flags_>
+void GpuHelper::loadMatrix(const Eigen::Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget)
 {
   setMatrixTarget(matrixTarget);
-  GlMatrixHelper<(_Flags&Eigen::RowMajorBit)!=0, _Flags>::loadMatrix(mat);
+  GlMatrixHelper<(Flags_&Eigen::RowMajorBit)!=0, Flags_>::loadMatrix(mat);
 }
 
 inline void GpuHelper::pushMatrix(GLenum matrixTarget)
@@ -166,11 +166,11 @@
     glPushMatrix();
 }
 
-template<typename Scalar, int _Flags>
-inline void GpuHelper::pushMatrix(const Matrix<Scalar,4,4, _Flags, 4,4>& mat, GLenum matrixTarget)
+template<typename Scalar, int Flags_>
+inline void GpuHelper::pushMatrix(const Matrix<Scalar,4,4, Flags_, 4,4>& mat, GLenum matrixTarget)
 {
     pushMatrix(matrixTarget);
-    GlMatrixHelper<_Flags&Eigen::RowMajorBit,_Flags>::loadMatrix(mat);
+    GlMatrixHelper<Flags_&Eigen::RowMajorBit,Flags_>::loadMatrix(mat);
 }
 
 template<typename Scalar, typename Derived>
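The storage-order dispatch above exists because fixed-function OpenGL consumes column-major 4x4 arrays: a column-major Eigen matrix can be handed over through data() unchanged, while a row-major one is first transposed into a temporary. A sketch of the two paths, assuming the demo's GL headers and its global `gpu` helper object:

    #include <Eigen/Core>

    void upload_modelview() {
      Eigen::Matrix<float,4,4,Eigen::ColMajor> mc = Eigen::Matrix4f::Identity();
      Eigen::Matrix<float,4,4,Eigen::RowMajor> mr = mc;

      gpu.loadMatrix(mc, GL_MODELVIEW); // resolves to glLoadMatrixf(mc.data())
      gpu.loadMatrix(mr, GL_MODELVIEW); // resolves to glLoadMatrixf(mr.transpose().eval().data())
    }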
Quaternionf(lerp(alpha,OrientationType(a.orientation),OrientationType(b.orientation)))); } -template class EulerAngles +template class EulerAngles { public: enum { Dim = 3 }; - typedef _Scalar Scalar; + typedef Scalar_ Scalar; typedef Matrix Matrix3; typedef Matrix Vector3; typedef Quaternion QuaternionType; diff --git a/libs/eigen/doc/CMakeLists.txt b/libs/eigen/doc/CMakeLists.txt index 0f9ef23..e7eaa4b 100644 --- a/libs/eigen/doc/CMakeLists.txt +++ b/libs/eigen/doc/CMakeLists.txt @@ -10,9 +10,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif() endif() -# some examples and snippets needs c++11, so let's check it once -check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11) - option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF) option(EIGEN_DOC_USE_MATHJAX "Use MathJax for rendering math in HTML docs" ON) diff --git a/libs/eigen/doc/Doxyfile.in b/libs/eigen/doc/Doxyfile.in index bc1e03c..d0e96fa 100644 --- a/libs/eigen/doc/Doxyfile.in +++ b/libs/eigen/doc/Doxyfile.in @@ -1600,8 +1600,6 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EIGEN_QT_SUPPORT \ EIGEN_STRONG_INLINE=inline \ EIGEN_DEVICE_FUNC= \ - EIGEN_HAS_CXX11=1 \ - EIGEN_HAS_CXX11_MATH=1 \ "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template const CwiseBinaryOp, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const;" \ "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp, const LHS, const RHS>"\ "EIGEN_CAT2(a,b)= a ## b"\ diff --git a/libs/eigen/doc/InsideEigenExample.dox b/libs/eigen/doc/InsideEigenExample.dox index ea2275b..4af185d 100644 --- a/libs/eigen/doc/InsideEigenExample.dox +++ b/libs/eigen/doc/InsideEigenExample.dox @@ -88,7 +88,7 @@ You may wonder, isn't it overengineering to have the storage in a separate class Let's look at this constructor, in src/Core/DenseStorage.h. You can see that there are many partial template specializations of DenseStorages here, treating separately the cases where dimensions are Dynamic or fixed at compile-time. The partial specialization that we are looking at is: \code -template class DenseStorage +template class DenseStorage \endcode Here, the constructor called is DenseStorage::DenseStorage(int size, int rows, int columns) @@ -101,7 +101,7 @@ inline DenseStorage(int size, int rows, int) : m_data(internal::aligned_new(s Here, the \a m_data member is the actual array of coefficients of the matrix. As you see, it is dynamically allocated. Rather than calling new[] or malloc(), as you can see, we have our own internal::aligned_new defined in src/Core/util/Memory.h. What it does is that if vectorization is enabled, then it uses a platform-specific call to allocate a 128-bit-aligned array, as that is very useful for vectorization with both SSE2 and AltiVec. If vectorization is disabled, it amounts to the standard new[]. -As you can see, the constructor also sets the \a m_rows member to \a size. Notice that there is no \a m_columns member: indeed, in this partial specialization of DenseStorage, we know the number of columns at compile-time, since the _Cols template parameter is different from Dynamic. Namely, in our case, _Cols is 1, which is to say that our vector is just a matrix with 1 column. Hence, there is no need to store the number of columns as a runtime variable. +As you can see, the constructor also sets the \a m_rows member to \a size. 
Notice that there is no \a m_columns member: indeed, in this partial specialization of DenseStorage, we know the number of columns at compile-time, since the Cols_ template parameter is different from Dynamic. Namely, in our case, Cols_ is 1, which is to say that our vector is just a matrix with 1 column. Hence, there is no need to store the number of columns as a runtime variable. When you call VectorXf::data() to get the pointer to the array of coefficients, it returns DenseStorage::data() which returns the \a m_data member. diff --git a/libs/eigen/doc/PreprocessorDirectives.dox b/libs/eigen/doc/PreprocessorDirectives.dox index 0f545b0..d6024dc 100644 --- a/libs/eigen/doc/PreprocessorDirectives.dox +++ b/libs/eigen/doc/PreprocessorDirectives.dox @@ -55,29 +55,15 @@ By default, %Eigen strive to automatically detect and enable language features a the information provided by the compiler. - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER. - Possible values are: 03, 11, 14, 17, etc. If not defined (the default), %Eigen enables all features supported + Possible values are: 14, 17, etc. If not defined (the default), %Eigen enables all features supported by the compiler. Individual features can be explicitly enabled or disabled by defining the following token to 0 or 1 respectively. -For instance, one might limit the C++ version to C++03 by defining EIGEN_MAX_CPP_VER=03, but still enable C99 math +For instance, one might limit the C++ version to C++14 by defining EIGEN_MAX_CPP_VER=14, but still enable C99 math functions by defining EIGEN_HAS_C99_MATH=1. - \b EIGEN_HAS_C99_MATH - controls the usage of C99 math functions such as erf, erfc, lgamma, etc. - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, logp1, isinf, isnan, etc. - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - - \b EIGEN_HAS_RVALUE_REFERENCES - defines whether rvalue references are supported - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - - \b EIGEN_HAS_VARIADIC_TEMPLATES - defines whether variadic templates are supported - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - - \b EIGEN_HAS_CONSTEXPR - defines whether relaxed const expression are supported - Automatic detection disabled if EIGEN_MAX_CPP_VER<14. - - \b EIGEN_HAS_CXX11_CONTAINERS - defines whether STL's containers follows C++11 specifications - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - - \b EIGEN_HAS_CXX11_NOEXCEPT - defines whether noexcept is supported - Automatic detection disabled if EIGEN_MAX_CPP_VER<11. - \b EIGEN_NO_IO - Disables any usage and support for ``. \section TopicPreprocessorDirectivesAssertions Assertions @@ -104,7 +90,7 @@ run time. However, these assertions do cost time and can thus be turned off. - \b \c EIGEN_MAX_ALIGN_BYTES - Must be a power of two, or 0. Defines an upper bound on the memory boundary in bytes on which dynamically and statically allocated data may be aligned by %Eigen. If not defined, a default value is automatically computed based on architecture, compiler, and OS. This option is typically used to enforce binary compatibility between code/libraries compiled with different SIMD options. 
For instance, one may compile AVX code and enforce ABI compatibility with existing SSE code by defining \c EIGEN_MAX_ALIGN_BYTES=16. In the other way round, since by default AVX implies 32 bytes alignment for best performance, one can compile SSE code to be ABI compatible with AVX code by defining \c EIGEN_MAX_ALIGN_BYTES=32. - \b \c EIGEN_MAX_STATIC_ALIGN_BYTES - Same as \c EIGEN_MAX_ALIGN_BYTES but for statically allocated data only. By default, if only \c EIGEN_MAX_ALIGN_BYTES is defined, then \c EIGEN_MAX_STATIC_ALIGN_BYTES == \c EIGEN_MAX_ALIGN_BYTES, otherwise a default value is automatically computed based on architecture, compiler, and OS (can be smaller than the default value of EIGEN_MAX_ALIGN_BYTES on architectures that do not support stack alignment). - Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a diserable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted. + Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a desirable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted. - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP. See \ref TopicMultiThreading for details. - \b \c EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless @@ -131,8 +117,11 @@ run time. However, these assertions do cost time and can thus be turned off. - \b \c EIGEN_DEFAULT_L2_CACHE_SIZE - Sets the default L2 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime. - \b \c EIGEN_DEFAULT_L3_CACHE_SIZE - Sets the default L3 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime. - - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b \c EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default. - - \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined. + - \b \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b \c EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default. + - \b \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined. + - \b \c EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH - Controls whether to use Eigen's dynamic dispatching for Altivec MMA or not. + - \b \c EIGEN_ALTIVEC_DISABLE_MMA - Overrides the usage of Altivec MMA instructions. + - \b \c EIGEN_ALTIVEC_USE_CUSTOM_PACK - Controls whether to use Eigen's custom packing for Altivec or not. 
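All of the tokens above are plain preprocessor defines, and they must be visible before the first %Eigen header is included. A minimal sketch of a configuration translation unit (the values chosen here are purely illustrative, not recommendations):
\code
// Illustrative only: cap alignment (e.g. for ABI compatibility with SSE code)
// and opt out of OpenMP parallelization. Must appear before any Eigen include.
#define EIGEN_MAX_ALIGN_BYTES 16
#define EIGEN_DONT_PARALLELIZE
#include <Eigen/Dense>

int main() {
  Eigen::Matrix4d m = Eigen::Matrix4d::Identity();
  return m.trace() == 4.0 ? 0 : 1;
}
\endcode
Passing the same tokens on the compiler command line (e.g. \c -DEIGEN_DONT_PARALLELIZE) is equivalent and avoids include-order mistakes.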
\section TopicPreprocessorDirectivesPlugins Plugins diff --git a/libs/eigen/doc/QuickReference.dox b/libs/eigen/doc/QuickReference.dox index c5dfce4..e96b617 100644 --- a/libs/eigen/doc/QuickReference.dox +++ b/libs/eigen/doc/QuickReference.dox @@ -367,7 +367,8 @@ vec2 = vec1.normalized(); vec1.normalize(); // inplace \endcode \link MatrixBase::cross() cross product \endlink \matrixworld\code #include -vec3 = vec1.cross(vec2);\endcode +v3c = v3a.cross(v3b); // size-3 vectors +scalar = v2a.cross(v2b); // size-2 vectors \endcode top diff --git a/libs/eigen/doc/QuickStartGuide.dox b/libs/eigen/doc/QuickStartGuide.dox index 4192b28..0372694 100644 --- a/libs/eigen/doc/QuickStartGuide.dox +++ b/libs/eigen/doc/QuickStartGuide.dox @@ -22,11 +22,11 @@ We will explain the program after telling you how to compile it. \section GettingStartedCompiling Compiling and running your first program -There is no library to link to. The only thing that you need to keep in mind when compiling the above program is that the compiler must be able to find the Eigen header files. The directory in which you placed Eigen's source code must be in the include path. With GCC you use the -I option to achieve this, so you can compile the program with a command like this: +There is no library to link to. The only thing that you need to keep in mind when compiling the above program is that the compiler must be able to find the Eigen header files. The directory in which you placed Eigen's source code must be in the include path. With GCC you use the \c -I option to achieve this, so you can compile the program with a command like this: \code g++ -I /path/to/eigen/ my_program.cpp -o my_program \endcode -On Linux or Mac OS X, another option is to symlink or copy the Eigen folder into /usr/local/include/. This way, you can compile the program with: +On Linux or Mac OS X, another option is to symlink or copy the Eigen folder into \c /usr/local/include/. This way, you can compile the program with: \code g++ my_program.cpp -o my_program \endcode diff --git a/libs/eigen/doc/SparseLinearSystems.dox b/libs/eigen/doc/SparseLinearSystems.dox index 38754e4..0135ee2 100644 --- a/libs/eigen/doc/SparseLinearSystems.dox +++ b/libs/eigen/doc/SparseLinearSystems.dox @@ -13,24 +13,20 @@ They are summarized in the following tables: - + - - - -
ClassSolver kindMatrix kindFeatures related to performanceLicense

Notes

Notes

SimplicialLLT \n \#includeDirect LLt factorizationSPDFill-in reducingLGPL SimplicialLDLT is often preferable
SimplicialLDLT \n \#includeDirect LDLt factorizationSPDFill-in reducingLGPL Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)
SparseLU \n \#include LU factorization Square Fill-in reducing, Leverage fast dense algebraMPL2 optimized for small and large problems with irregular patterns
SparseQR \n \#include QR factorization Any, rectangular Fill-in reducingMPL2 recommended for least-square problems, has a basic rank-revealing feature
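As a rough usage sketch for the direct solvers listed above (the helper function, its name, and the error handling are illustrative, not part of %Eigen's API):
\code
#include <Eigen/Sparse>

// Sketch: solve A x = b with SparseLU. A and b are assumed to be
// assembled elsewhere; an empty vector signals a failed factorization.
Eigen::VectorXd solveWithSparseLU(const Eigen::SparseMatrix<double>& A,
                                  const Eigen::VectorXd& b) {
  Eigen::SparseLU<Eigen::SparseMatrix<double>, Eigen::COLAMDOrdering<int>> solver;
  solver.analyzePattern(A);   // fill-in reducing reordering, structure only
  solver.factorize(A);        // numerical factorization
  if (solver.info() != Eigen::Success)
    return Eigen::VectorXd();
  return solver.solve(b);
}
\endcode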
@@ -38,21 +34,18 @@ They are summarized in the following tables: - + - - - + -
ClassSolver kindMatrix kindSupported preconditioners, [default]License

Notes

Notes

ConjugateGradient \n \#include Classic iterative CGSPD IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholeskyMPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.)
LeastSquaresConjugateGradient \n \#includeCG for rectangular least-square problemRectangular IdentityPreconditioner, [LeastSquareDiagonalPreconditioner]MPL2Solve for min |A'Ax-b|^2 without forming A'A
Solve for min |Ax-b|^2 without forming A'A
BiCGSTAB \n \#includeIterative stabilized bi-conjugate gradientSquare IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUTMPL2 To speed up the convergence, try it with the \ref IncompleteLUT preconditioner.
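A corresponding sketch for the iterative solvers above (the tolerance and iteration cap are arbitrary illustration values):
\code
#include <Eigen/Sparse>
#include <iostream>

// Sketch: solve an SPD system with ConjugateGradient and its default
// DiagonalPreconditioner. A and b are assumed to be assembled elsewhere.
Eigen::VectorXd solveWithCG(const Eigen::SparseMatrix<double>& A,
                            const Eigen::VectorXd& b) {
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower | Eigen::Upper> cg;
  cg.setTolerance(1e-8);      // desired relative accuracy
  cg.setMaxIterations(1000);
  cg.compute(A);              // for iterative solvers: sets up the preconditioner
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations()
            << ", estimated error: " << cg.error() << std::endl;
  return x;
}
\endcode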
@@ -82,6 +75,9 @@ They are summarized in the following tables: PardisoLLT \n PardisoLDLT \n PardisoLU\link PardisoSupport_Module PardisoSupport \endlinkDirect LLt, LDLt, LU factorizationsSPD \n SPD \n SquareFill-in reducing, Leverage fast dense algebra, Multithreading Requires the Intel MKL package, \b Proprietary optimized for tough problem patterns, see also \link TopicUsingIntelMKL using MKL with Eigen \endlink +AccelerateLLT \n AccelerateLDLT \n AccelerateQR\link AccelerateSupport_Module AccelerateSupport \endlinkDirect LLt, LDLt, QR factorizationsSPD \n SPD \n RectangularFill-in reducing, Leverage fast dense algebra, Multithreading + Requires the Apple Accelerate package, \b Proprietary + + Here \c SPD means symmetric positive definite. @@ -137,7 +133,7 @@ x1 = solver.solve(b1); x2 = solver.solve(b2); ... \endcode -The compute() method is equivalent to calling both analyzePattern() and factorize(). +The `compute()` method is equivalent to calling both `analyzePattern()` and `factorize()`. Each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on. More details are available in the documentation of the respective classes. @@ -145,9 +141,9 @@ More details are available in the documentation of the respective classes. Finally, most of the iterative solvers can also be used in a \b matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. \section TheSparseCompute The Compute Step -In the compute() function, the matrix is generally factorized: LLT for self-adjoint matrices, LDLT for general hermitian matrices, LU for non hermitian matrices and QR for rectangular matrices. These are the results of using direct solvers. For this class of solvers precisely, the compute step is further subdivided into analyzePattern() and factorize(). +In the `compute()` function, the matrix is generally factorized: LLT for self-adjoint matrices, LDLT for general hermitian matrices, LU for non hermitian matrices and QR for rectangular matrices. These are the results of using direct solvers. For this class of solvers precisely, the compute step is further subdivided into `analyzePattern()` and `factorize()`. -The goal of analyzePattern() is to reorder the nonzero elements of the matrix, such that the factorization step creates less fill-in. This step exploits only the structure of the matrix. Hence, the results of this step can be used for other linear systems where the matrix has the same structure. Note however that sometimes, some external solvers (like SuperLU) require that the values of the matrix are set in this step, for instance to equilibrate the rows and columns of the matrix. In this situation, the results of this step should not be used with other matrices. +The goal of `analyzePattern()` is to reorder the nonzero elements of the matrix, such that the factorization step creates less fill-in. This step exploits only the structure of the matrix. Hence, the results of this step can be used for other linear systems where the matrix has the same structure. Note however that sometimes, some external solvers (like SuperLU) require that the values of the matrix are set in this step, for instance to equilibrate the rows and columns of the matrix. In this situation, the results of this step should not be used with other matrices. Eigen provides a limited set of methods to reorder the matrix in this step, either built-in (COLAMD, AMD) or external (METIS).
These methods are set in the template parameter list of the solver: \code @@ -156,21 +152,21 @@ DirectSolverClassName<SparseMatrix<double>, OrderingMethod > solver; \endcode See the \link OrderingMethods_Module OrderingMethods module \endlink for the list of available methods and the associated options. -In factorize(), the factors of the coefficient matrix are computed. This step should be called each time the values of the matrix change. However, the structural pattern of the matrix should not change between multiple calls. +In `factorize()`, the factors of the coefficient matrix are computed. This step should be called each time the values of the matrix change. However, the structural pattern of the matrix should not change between multiple calls. For iterative solvers, the compute step is used to eventually set up a preconditioner. For instance, with the ILUT preconditioner, the incomplete factors L and U are computed in this step. Remember that, basically, the goal of the preconditioner is to speed up the convergence of an iterative method by solving a modified linear system where the coefficient matrix has more clustered eigenvalues. For real problems, an iterative solver should always be used with a preconditioner. In Eigen, a preconditioner is selected by simply adding it as a template parameter to the iterative solver object. \code IterativeSolverClassName<SparseMatrix<double>, PreconditionerName > solver; \endcode -The member function preconditioner() returns a read-write reference to the preconditioner +The member function `preconditioner()` returns a read-write reference to the preconditioner to directly interact with it. See the \link IterativeLinearSolvers_Module Iterative solvers module \endlink and the documentation of each class for the list of available methods. \section TheSparseSolve The Solve step -The solve() function computes the solution of the linear systems with one or many right hand sides. +The `solve()` function computes the solution of the linear systems with one or many right hand sides. \code X = solver.solve(B); \endcode -Here, B can be a vector or a matrix where the columns form the different right hand sides. The solve() function can be called several times as well, for instance when all the right hand sides are not available at once. +Here, B can be a vector or a matrix where the columns form the different right hand sides. The `solve()` function can be called several times as well, for instance when all the right hand sides are not available at once. \code x1 = solver.solve(b1); // Get the second right hand side b2 @@ -180,7 +176,7 @@ x2 = solver.solve(b2); For direct methods, the solutions are computed at machine precision. Sometimes, the solution need not be too accurate. In this case, the iterative methods are more suitable and the desired accuracy can be set before the solve step using \b setTolerance(). For all the available functions, please refer to the documentation of the \link IterativeLinearSolvers_Module Iterative solvers module \endlink. \section BenchmarkRoutine -Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options.
Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen. +Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to `bench/spbench` and compile the routine by typing `make spbenchsolver`. Run it with the `--help` option to get the list of all available options. Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen. To export your matrices and right-hand-side vectors in the matrix-market format, you can use the unsupported SparseExtra module: \code diff --git a/libs/eigen/doc/SparseQuickReference.dox b/libs/eigen/doc/SparseQuickReference.dox index 9779f3f..b8264a4 100644 --- a/libs/eigen/doc/SparseQuickReference.dox +++ b/libs/eigen/doc/SparseQuickReference.dox @@ -249,7 +249,7 @@ sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector \endcode -If the matrix is not in compressed form, makeCompressed() should be called before.\n +If the matrix is not in compressed form, `makeCompressed()` should be called before.\n Note that these functions are mostly provided for interoperability purposes with external libraries.\n A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section diff --git a/libs/eigen/doc/StructHavingEigenMembers.dox b/libs/eigen/doc/StructHavingEigenMembers.dox index 87016cd..4c97093 100644 --- a/libs/eigen/doc/StructHavingEigenMembers.dox +++ b/libs/eigen/doc/StructHavingEigenMembers.dox @@ -80,7 +80,7 @@ But AVX instructions (at least the ones that %Eigen uses, which are the fast one) Otherwise you get a segmentation fault. For this reason, %Eigen takes care by itself to require 256-bit alignment for Eigen::Vector4d, by doing two things: -\li %Eigen requires 256-bit alignment for the Eigen::Vector4d's array (of 4 doubles). With \cpp11 this is done with the alignas keyword, or compiler's extensions for c++98/03. +\li %Eigen requires 256-bit alignment for the Eigen::Vector4d's array (of 4 doubles). This is done with the alignas keyword. \li %Eigen overloads the `operator new` of Eigen::Vector4d so it will always return 256-bit aligned pointers. (removed in \cpp17) Thus, normally, you don't have to worry about anything, %Eigen handles alignment of operator new for you... diff --git a/libs/eigen/doc/TopicLinearAlgebraDecompositions.dox b/libs/eigen/doc/TopicLinearAlgebraDecompositions.dox index 402b376..8598ce6 100644 --- a/libs/eigen/doc/TopicLinearAlgebraDecompositions.dox +++ b/libs/eigen/doc/TopicLinearAlgebraDecompositions.dox @@ -272,7 +272,7 @@ To get an overview of the true relative speed of the different decompositions, c
Blocking
Means the algorithm can work per block, hence guaranteeing a good scaling of the performance for large matrices.
Implicit Multi Threading (MT)
-
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.
+
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algorithm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.
Explicit Multi Threading (MT)
Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.
Meta-unroller
diff --git a/libs/eigen/doc/TutorialMatrixArithmetic.dox b/libs/eigen/doc/TutorialMatrixArithmetic.dox index 5fc569a..53916c2 100644 --- a/libs/eigen/doc/TutorialMatrixArithmetic.dox +++ b/libs/eigen/doc/TutorialMatrixArithmetic.dox @@ -158,7 +158,7 @@ For dot product and cross product, you need the \link MatrixBase::dot() dot()\en \verbinclude tut_arithmetic_dot_cross.out -Remember that cross product is only for vectors of size 3. Dot product is for vectors of any sizes. +Cross product is defined in Eigen not only for vectors of size 3 but also for those of size 2, check \link MatrixBase::cross() the doc\endlink for details. Dot product is for vectors of any sizes. When using complex numbers, Eigen's dot product is conjugate-linear in the first variable and linear in the second variable. diff --git a/libs/eigen/doc/TutorialMatrixClass.dox b/libs/eigen/doc/TutorialMatrixClass.dox index 2c45222..e4e4f98 100644 --- a/libs/eigen/doc/TutorialMatrixClass.dox +++ b/libs/eigen/doc/TutorialMatrixClass.dox @@ -111,9 +111,9 @@ Vector4d c(5.0, 6.0, 7.0, 8.0); If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients: \code -Vector2i a(1, 2); // A column vector containing the elements {1, 2} -Matrix b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} -Matrix c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} +Vector2i a(1, 2); // A column-vector containing the elements {1, 2} +Matrix b {1, 2, 3, 4, 5}; // A column-vector containing the elements {1, 2, 3, 4, 5} +Matrix c = {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} \endcode In the general case of matrices and vectors with either fixed or runtime sizes, @@ -151,14 +151,14 @@ The numbering starts at 0. This example is self-explanatory: \verbinclude tut_matrix_coefficient_accessors.out -Note that the syntax m(index) +Note that the syntax `m(index)` is not restricted to vectors, it is also available for general matrices, meaning index-based access in the array of coefficients. This however depends on the matrix's storage order. All Eigen matrices default to column-major storage order, but this can be changed to row-major, see \ref TopicStorageOrders "Storage orders". -The operator[] is also overloaded for index-based access in vectors, but keep in mind that C++ doesn't allow operator[] to -take more than one argument. We restrict operator[] to vectors, because an awkwardness in the C++ language -would make matrix[i,j] compile to the same thing as matrix[j] ! +The `operator[]` is also overloaded for index-based access in vectors, but keep in mind that C++ doesn't allow `operator[]` to +take more than one argument. We restrict `operator[]` to vectors, because an awkwardness in the C++ language +would make `matrix[i,j]` compile to the same thing as `matrix[j]`! \section TutorialMatrixCommaInitializer Comma-initialization @@ -186,8 +186,8 @@ The current size of a matrix can be retrieved by \link EigenBase::rows() rows()\ \verbinclude tut_matrix_resize.out -The resize() method is a no-operation if the actual matrix size doesn't change; otherwise it is destructive: the values of the coefficients may change. -If you want a conservative variant of resize() which does not change the coefficients, use \link PlainObjectBase::conservativeResize() conservativeResize()\endlink, see \ref TopicResizing "this page" for more details. 
+The `resize()` method is a no-operation if the actual matrix size doesn't change; otherwise it is destructive: the values of the coefficients may change. +If you want a conservative variant of `resize()` which does not change the coefficients, use \link PlainObjectBase::conservativeResize() conservativeResize()\endlink, see \ref TopicResizing "this page" for more details. All these methods are still available on fixed-size matrices, for the sake of API uniformity. Of course, you can't actually resize a fixed-size matrix. Trying to change a fixed size to an actually different value will trigger an assertion failure; @@ -234,7 +234,7 @@ is always allocated on the heap, so doing \code MatrixXf mymatrix(rows,columns); \endcode amounts to doing \code float *mymatrix = new float[rows*columns]; \endcode -and in addition to that, the MatrixXf object stores its number of rows and columns as +and in addition to that, the \c MatrixXf object stores its number of rows and columns as member variables. The limitation of using fixed sizes, of course, is that this is only possible @@ -276,14 +276,16 @@ Matrix. For example, MatrixXi for Matrix. -\li VectorNt for Matrix. For example, Vector2f for Matrix. -\li RowVectorNt for Matrix. For example, RowVector3d for Matrix. +\li \c MatrixNt for `Matrix`. For example, \c MatrixXi for `Matrix`. +\li \c MatrixXNt for `Matrix`. For example, \c MatrixX3i for `Matrix`. +\li \c MatrixNXt for `Matrix`. For example, \c Matrix4Xd for `Matrix`. +\li \c VectorNt for `Matrix`. For example, \c Vector2f for `Matrix`. +\li \c RowVectorNt for `Matrix`. For example, \c RowVector3d for `Matrix`. Where: -\li N can be any one of \c 2, \c 3, \c 4, or \c X (meaning \c Dynamic). -\li t can be any one of \c i (meaning int), \c f (meaning float), \c d (meaning double), - \c cf (meaning complex), or \c cd (meaning complex). The fact that typedefs are only +\li \c N can be any one of \c 2, \c 3, \c 4, or \c X (meaning \c Dynamic). +\li \c t can be any one of \c i (meaning \c int), \c f (meaning \c float), \c d (meaning \c double), + \c cf (meaning `complex`), or \c cd (meaning `complex`). The fact that `typedef`s are only defined for these five types doesn't mean that they are the only supported scalar types. For example, all standard integer types are supported, see \ref TopicScalarTypes "Scalar types". diff --git a/libs/eigen/doc/TutorialReshape.dox b/libs/eigen/doc/TutorialReshape.dox index 5b4022a..07e5c3c 100644 --- a/libs/eigen/doc/TutorialReshape.dox +++ b/libs/eigen/doc/TutorialReshape.dox @@ -3,7 +3,7 @@ namespace Eigen { /** \eigenManualPage TutorialReshape Reshape Since the version 3.4, %Eigen exposes convenient methods to reshape a matrix to another matrix of different sizes or vector. -All cases are handled via the DenseBase::reshaped(NRowsType,NColsType) and DenseBase::reshaped() functions. +All cases are handled via the `DenseBase::reshaped(NRowsType,NColsType)` and `DenseBase::reshaped()` functions. Those functions do not perform in-place reshaping, but instead return a view on the input expression. \eigenAutoToc @@ -23,7 +23,7 @@ Here is an example reshaping a 4x4 matrix to a 2x8 one: By default, the input coefficients are always interpreted in column-major order regardless of the storage order of the input expression. -For more control on ordering, compile-time sizes, and automatic size deduction, please see de documentation of DenseBase::reshaped(NRowsType,NColsType) that contains all the details with many examples. 
+For more control on ordering, compile-time sizes, and automatic size deduction, please see the documentation of `DenseBase::reshaped(NRowsType,NColsType)` that contains all the details with many examples. \section TutorialReshapeMat2Vec 1D linear views diff --git a/libs/eigen/doc/TutorialSlicingIndexing.dox b/libs/eigen/doc/TutorialSlicingIndexing.dox index 98ace43..8b067df 100644 --- a/libs/eigen/doc/TutorialSlicingIndexing.dox +++ b/libs/eigen/doc/TutorialSlicingIndexing.dox @@ -15,7 +15,7 @@ All the aforementioned operations are handled through the generic DenseBase::ope Each argument can be: - An integer indexing a single row or column, including symbolic indices. - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order. - - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions. + - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::placeholders::lastN functions. - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`. More generally, it can accept any object exposing the following two member functions: @@ -72,7 +72,7 @@ Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v. %Block starting at \c i,j having \c m rows, and \c n columns - \code A(seqN(i,m), seqN(i,n) \endcode + \code A(seqN(i,m), seqN(i,n)) \endcode \code A.block(i,j,m,n) \endcode @@ -112,9 +112,10 @@ Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v. -As seen in the last exemple, referencing the last n elements (or rows/columns) is a bit cumbersome to write. +As seen in the last example, referencing the last n elements (or rows/columns) is a bit cumbersome to write. This becomes even more tricky and error prone with a non-default increment. -Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link Eigen::lastN(SizeType,IncrType) Eigen::lastN(size,incr) \endlink: +Here comes \link Eigen::placeholders::lastN(SizeType) Eigen::placeholders::lastN(size) \endlink, and +\link Eigen::placeholders::lastN(SizeType,IncrType) Eigen::placeholders::lastN(size,incr) \endlink:
Bottom-right corner of A of size \c m times \c n\code v(lastN(m), lastN(n)) \endcode\code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode
Bottom-right corner of A of size \c m times \c n\code v(lastN(m), lastN(n)) \endcode\code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode
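Putting the slicing pieces above together, a small self-contained sketch (the placeholders namespace for lastN follows the convention used in this documentation; older %Eigen versions export it directly under Eigen::):
\code
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
  using Eigen::seq;
  using Eigen::seqN;
  using Eigen::placeholders::lastN;

  Eigen::MatrixXd block    = A(seqN(1, 3), seqN(2, 2));    // same as A.block(1, 2, 3, 2)
  Eigen::MatrixXd corner   = A(lastN(2), lastN(2));        // same as A.bottomRightCorner(2, 2)
  Eigen::MatrixXd evenRows = A(seq(0, 5, 2), Eigen::all);  // rows 0, 2, 4
  return 0;
}
\endcode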
diff --git a/libs/eigen/doc/TutorialSparse.dox b/libs/eigen/doc/TutorialSparse.dox index c69171e..a00bacd 100644 --- a/libs/eigen/doc/TutorialSparse.dox +++ b/libs/eigen/doc/TutorialSparse.dox @@ -44,8 +44,8 @@ This storage scheme is better explained on an example. The following matrix and one of its possible sparse, \b column \b major representation:
Example:Output:
- - + +
Values: 227_3514__1_178
InnerIndices: 12_02 4__2_ 14
Values: 227_35_14_1_178
InnerIndices: 12_02_4_2_ 14
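The buffers shown above can be inspected directly through the raw-pointer accessors mentioned later in this patch (valuePtr(), innerIndexPtr(), outerIndexPtr()). A sketch printing the compressed form, in which the \c InnerNNZs array no longer exists:
\code
#include <Eigen/Sparse>
#include <iostream>

// Sketch: build a small column-major sparse matrix, compress it, and dump
// the Values / InnerIndices / OuterStarts arrays discussed above.
int main() {
  Eigen::SparseMatrix<double> mat(4, 4);   // column-major by default
  mat.insert(1, 0) = 22.0;
  mat.insert(3, 1) = 5.0;
  mat.insert(2, 3) = 14.0;
  mat.makeCompressed();                    // drop free space: plain CCS

  for (Eigen::Index k = 0; k < mat.nonZeros(); ++k)
    std::cout << "Values[" << k << "] = " << mat.valuePtr()[k]
              << ", InnerIndices[" << k << "] = " << mat.innerIndexPtr()[k] << "\n";
  for (Eigen::Index j = 0; j <= mat.outerSize(); ++j)
    std::cout << "OuterStarts[" << j << "] = " << mat.outerIndexPtr()[j] << "\n";
  return 0;
}
\endcode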
@@ -54,13 +54,13 @@ and one of its possible sparse, \b column \b major representation: Currently the elements of a given inner vector are guaranteed to be always sorted by increasing inner indices. The \c "_" indicates available free space to quickly insert new elements. -Assuming no reallocation is needed, the insertion of a random element is therefore in O(nnz_j) where nnz_j is the number of nonzeros of the respective inner vector. -On the other hand, inserting elements with increasing inner indices in a given inner vector is much more efficient since this only requires to increase the respective \c InnerNNZs entry that is a O(1) operation. +Assuming no reallocation is needed, the insertion of a random element is therefore in `O(nnz_j)` where `nnz_j` is the number of nonzeros of the respective inner vector. +On the other hand, inserting elements with increasing inner indices in a given inner vector is much more efficient since this only requires to increase the respective \c InnerNNZs entry that is a `O(1)` operation. The case where no empty space is available is a special case, and is referred to as the \em compressed mode. It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS). Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function. -In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. +In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: `InnerNNZs[j] == OuterStarts[j+1] - OuterStarts[j]`. Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer. It is worth noting that most of our wrappers to external libraries require compressed matrices as inputs. @@ -221,9 +221,9 @@ A typical scenario of this approach is illustrated below: 5: mat.makeCompressed(); // optional \endcode -- The key ingredient here is the line 2 where we reserve room for 6 non-zeros per column. In many cases, the number of non-zeros per column or row can easily be known in advance. If it varies significantly for each inner vector, then it is possible to specify a reserve size for each inner vector by providing a vector object with an operator[](int j) returning the reserve size of the \c j-th inner vector (e.g., via a VectorXi or std::vector<int>). If only a rought estimate of the number of nonzeros per inner-vector can be obtained, it is highly recommended to overestimate it rather than the opposite. If this line is omitted, then the first insertion of a new element will reserve room for 2 elements per inner vector. +- The key ingredient here is line 2, where we reserve room for 6 non-zeros per column. In many cases, the number of non-zeros per column or row can easily be known in advance. If it varies significantly for each inner vector, then it is possible to specify a reserve size for each inner vector by providing a vector object with an `operator[](int j)` returning the reserve size of the \c j-th inner vector (e.g., via a `VectorXi` or `std::vector<int>`). If only a rough estimate of the number of nonzeros per inner-vector can be obtained, it is highly recommended to overestimate it rather than the opposite. If this line is omitted, then the first insertion of a new element will reserve room for 2 elements per inner vector. - The line 4 performs a sorted insertion.
In this example, the ideal case is when the \c j-th column is not full and contains non-zeros whose inner-indices are smaller than \c i. In this case, this operation boils down to a trivial O(1) operation. -- When calling insert(i,j) the element \c i \c ,j must not already exists, otherwise use the coeffRef(i,j) method that will allow to, e.g., accumulate values. This method first performs a binary search and finally calls insert(i,j) if the element does not already exist. It is more flexible than insert() but also more costly. +- When calling `insert(i,j)`, the element `i`, `j` must not already exist, otherwise use the `coeffRef(i,j)` method that will allow you to, e.g., accumulate values. This method first performs a binary search and finally calls `insert(i,j)` if the element does not already exist. It is more flexible than `insert()` but also more costly. - The line 5 suppresses the remaining empty space and transforms the matrix into a compressed column storage. @@ -259,7 +259,7 @@ sm2 = sm1.cwiseProduct(dm1); dm2 = sm1 + dm1; dm2 = dm1 - sm1; \endcode -Performance-wise, the adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing dm2 = sm1 + dm1, better write: +Performance-wise, adding/subtracting sparse and dense matrices is better performed in two steps. For instance, instead of doing `dm2 = sm1 + dm1`, better write: \code dm2 = dm1; dm2 += sm1; @@ -272,7 +272,7 @@ This version has the advantage to fully exploit the higher performance of dense... sm1 = sm2.transpose(); sm1 = sm2.adjoint(); \endcode -However, there is no transposeInPlace() method. +However, there is no `transposeInPlace()` method. \subsection TutorialSparse_Products Matrix products @@ -284,18 +284,18 @@ dv2 = sm1 * dv1; dm2 = dm1 * sm1.adjoint(); dm2 = 2. * sm1 * dm1; \endcode - - \b symmetric \b sparse-dense. The product of a sparse symmetric matrix with a dense matrix (or vector) can also be optimized by specifying the symmetry with selfadjointView(): + - \b symmetric \b sparse-dense. The product of a sparse symmetric matrix with a dense matrix (or vector) can also be optimized by specifying the symmetry with `selfadjointView()`: \code -dm2 = sm1.selfadjointView<>() * dm1; // if all coefficients of A are stored -dm2 = A.selfadjointView<Upper>() * dm1; // if only the upper part of A is stored -dm2 = A.selfadjointView<Lower>() * dm1; // if only the lower part of A is stored +dm2 = sm1.selfadjointView<>() * dm1; // if all coefficients of sm1 are stored +dm2 = sm1.selfadjointView<Upper>() * dm1; // if only the upper part of sm1 is stored +dm2 = sm1.selfadjointView<Lower>() * dm1; // if only the lower part of sm1 is stored \endcode - \b sparse-sparse. For sparse-sparse products, two different algorithms are available. The default one is conservative and preserves the explicit zeros that might appear: \code sm3 = sm1 * sm2; sm3 = 4 * sm1.adjoint() * sm2; \endcode - The second algorithm prunes on the fly the explicit zeros, or the values smaller than a given threshold.
It is enabled and controlled through the `prune()` functions: \code sm3 = (sm1 * sm2).pruned(); // removes numerical zeros sm3 = (sm1 * sm2).pruned(ref); // removes elements much smaller than ref @@ -314,7 +314,7 @@ sm2 = sm1.transpose() * P; \subsection TutorialSparse_SubMatrices Block operations Regarding read-access, sparse matrices expose the same API than for dense matrices to access to sub-matrices such as blocks, columns, and rows. See \ref TutorialBlockOperations for a detailed introduction. -However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as block(...) and corner*(...). The available API for write-access to a SparseMatrix are summarized below: +However, for performance reasons, writing to a sub-sparse-matrix is much more limited, and currently only contiguous sets of columns (resp. rows) of a column-major (resp. row-major) SparseMatrix are writable. Moreover, this information has to be known at compile-time, leaving out methods such as `block(...)` and `corner*(...)`. The available API for write-access to a SparseMatrix are summarized below: \code SparseMatrix sm1; sm1.col(j) = ...; @@ -329,22 +329,22 @@ sm2.middleRows(i,nrows) = ...; sm2.bottomRows(nrows) = ...; \endcode -In addition, sparse matrices expose the SparseMatrixBase::innerVector() and SparseMatrixBase::innerVectors() methods, which are aliases to the col/middleCols methods for a column-major storage, and to the row/middleRows methods for a row-major storage. +In addition, sparse matrices expose the `SparseMatrixBase::innerVector()` and `SparseMatrixBase::innerVectors()` methods, which are aliases to the `col`/`middleCols` methods for a column-major storage, and to the `row`/`middleRows` methods for a row-major storage. \subsection TutorialSparse_TriangularSelfadjoint Triangular and selfadjoint views -Just as with dense matrices, the triangularView() function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side: +Just as with dense matrices, the `triangularView()` function can be used to address a triangular part of the matrix, and perform triangular solves with a dense right hand side: \code dm2 = sm1.triangularView(dm1); dv2 = sm1.transpose().triangularView(dv1); \endcode -The selfadjointView() function permits various operations: +The `selfadjointView()` function permits various operations: - optimized sparse-dense matrix products: \code -dm2 = sm1.selfadjointView<>() * dm1; // if all coefficients of A are stored -dm2 = A.selfadjointView() * dm1; // if only the upper part of A is stored -dm2 = A.selfadjointView() * dm1; // if only the lower part of A is stored +dm2 = sm1.selfadjointView<>() * dm1; // if all coefficients of sm1 are stored +dm2 = sm1.selfadjointView() * dm1; // if only the upper part of sm1 is stored +dm2 = sm1.selfadjointView() * dm1; // if only the lower part of sm1 is stored \endcode - copy of triangular parts: \code diff --git a/libs/eigen/doc/UsingBlasLapackBackends.dox b/libs/eigen/doc/UsingBlasLapackBackends.dox index caa5971..c700d85 100644 --- a/libs/eigen/doc/UsingBlasLapackBackends.dox +++ b/libs/eigen/doc/UsingBlasLapackBackends.dox @@ -101,11 +101,17 @@ m1.colPivHouseholderQr(); ?geqp3 \endcode +
OuterStarts:035810\em 12
Singular value decomposition \n \c EIGEN_USE_LAPACKE \code -JacobiSVD svd; -svd.compute(m1, ComputeThinV); +JacobiSVD svd; +svd.compute(m1); \endcode\code ?gesvd \endcode
Singular value decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT \code +BDCSVD svd; +svd.compute(m1); +\endcode\code +?gesdd +\endcode
Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT \code EigenSolver es(m1); ComplexEigenSolver ces(m1); diff --git a/libs/eigen/doc/eigen_navtree_hacks.js b/libs/eigen/doc/eigen_navtree_hacks.js index afb97ed..f36b332 100644 --- a/libs/eigen/doc/eigen_navtree_hacks.js +++ b/libs/eigen/doc/eigen_navtree_hacks.js @@ -62,23 +62,161 @@ function getNode(o, po) } } -// Overloaded to adjust the size of the navtree wrt the toc -function resizeHeight() -{ - var header = $("#top"); - var sidenav = $("#side-nav"); - var content = $("#doc-content"); - var navtree = $("#nav-tree"); - var footer = $("#nav-path"); - var toc = $("#nav-toc"); +/* + @licstart The following is the entire license notice for the JavaScript code in this file. - var headerHeight = header.outerHeight(); - var footerHeight = footer.outerHeight(); - var tocHeight = toc.height(); - var windowHeight = $(window).height() - headerHeight - footerHeight; - content.css({height:windowHeight + "px"}); - navtree.css({height:(windowHeight-tocHeight) + "px"}); - sidenav.css({height:windowHeight + "px"}); + The MIT License (MIT) + + Copyright (C) 1997-2020 by Dimitri van Heesch + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software + and associated documentation files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, publish, distribute, + sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or + substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + @licend The above is the entire license notice for the JavaScript code in this file + */ +// We need to override entire resizable just so we can change the height to account for the TOC. 
+function initResizable() +{ + var cookie_namespace = 'doxygen'; + var sidenav,navtree,content,header,collapsed,collapsedWidth=0,barWidth=6,desktop_vp=768,titleHeight; + + function readCookie(cookie) + { + var myCookie = cookie_namespace+"_"+cookie+"="; + if (document.cookie) { + var index = document.cookie.indexOf(myCookie); + if (index != -1) { + var valStart = index + myCookie.length; + var valEnd = document.cookie.indexOf(";", valStart); + if (valEnd == -1) { + valEnd = document.cookie.length; + } + var val = document.cookie.substring(valStart, valEnd); + return val; + } + } + return 0; + } + + function writeCookie(cookie, val, expiration) + { + if (val==undefined) return; + if (expiration == null) { + var date = new Date(); + date.setTime(date.getTime()+(10*365*24*60*60*1000)); // default expiration is one week + expiration = date.toGMTString(); + } + document.cookie = cookie_namespace + "_" + cookie + "=" + val + "; expires=" + expiration+"; path=/"; + } + + function resizeWidth() + { + var windowWidth = $(window).width() + "px"; + var sidenavWidth = $(sidenav).outerWidth(); + content.css({marginLeft:parseInt(sidenavWidth)+"px"}); + writeCookie('width',sidenavWidth-barWidth, null); + } + + function restoreWidth(navWidth) + { + var windowWidth = $(window).width() + "px"; + content.css({marginLeft:parseInt(navWidth)+barWidth+"px"}); + sidenav.css({width:navWidth + "px"}); + } + + function resizeHeight() + { + var headerHeight = header.outerHeight(); + var footerHeight = footer.outerHeight(); + var windowHeight = $(window).height() - headerHeight - footerHeight; + //========================================================================== + // MODIFICATION: + // This small section is the only portion modified within initResizable(). + // The rest is copy-pasted from the doxygen-generated resize.js. + // + // Adjust nav height to make room for TOC. + var toc = $("#nav-toc"); + var tocHeight = toc.height(); + var navHeight = windowHeight; + // tocHeight is not always defined (e.g. 
if empty) + if (tocHeight) { + navHeight = windowHeight - tocHeight; + } + //========================================================================== + + content.css({height:windowHeight + "px"}); + navtree.css({height:navHeight + "px"}); + sidenav.css({height:windowHeight + "px"}); + + var width=$(window).width(); + if (width!=collapsedWidth) { + if (width<desktop_vp && collapsedWidth>=desktop_vp) { + if (!collapsed) { + collapseExpand(); + } + } else if (width>desktop_vp && collapsedWidth<desktop_vp) { + if (collapsed) { + collapseExpand(); + } + } + collapsedWidth=width; + } + } + + function collapseExpand() + { + if (sidenav.width()>0) { + restoreWidth(0); + collapsed=true; + } + else { + var width = readCookie('width'); + if (width>200 && width<$(window).width()) { restoreWidth(width); } else { restoreWidth(200); } + collapsed=false; + } + } + header = $("#top"); + sidenav = $("#side-nav"); + content = $("#doc-content"); + navtree = $("#nav-tree"); + footer = $("#nav-path"); + + $(".side-nav-resizable").resizable({resize: function(e, ui) { resizeWidth(); } }); + $(sidenav).resizable({ minWidth: 0 }); + $(window).resize(function() { resizeHeight(); }); + var device = navigator.userAgent.toLowerCase(); + var touch_device = device.match(/(iphone|ipod|ipad|android)/); + if (touch_device) { /* wider split bar for touch only devices */ + $(sidenav).css({ paddingRight:'20px' }); + $('.ui-resizable-e').css({ width:'20px' }); + $('#nav-sync').css({ right:'34px' }); + barWidth=20; + } + var width = readCookie('width'); + if (width) { restoreWidth(width); } else { resizeWidth(); } + resizeHeight(); + var url = location.href; + var i=url.indexOf("#"); + if (i>=0) window.location.hash=url.substr(i); + var _preventDefault = function(evt) { evt.preventDefault(); }; + $("#splitbar").bind("dragstart", _preventDefault).bind("selectstart", _preventDefault); + $(".ui-resizable-handle").dblclick(collapseExpand); + $(window).on('load',resizeHeight); }
Please, help us to better know about our user community by answering the following short survey: https://forms.gle/wpyrxWi18ox9Z5ae9
-
diff --git a/libs/eigen/doc/examples/CMakeLists.txt b/libs/eigen/doc/examples/CMakeLists.txt index a2c9d05..dd49e3c 100644 --- a/libs/eigen/doc/examples/CMakeLists.txt +++ b/libs/eigen/doc/examples/CMakeLists.txt @@ -14,7 +14,3 @@ foreach(example_src ${examples_SRCS}) ) add_dependencies(all_examples ${example}) endforeach() - -if(EIGEN_COMPILER_SUPPORT_CPP11) -ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11") -endif() \ No newline at end of file diff --git a/libs/eigen/doc/examples/Cwise_erf.cpp b/libs/eigen/doc/examples/Cwise_erf.cpp index e7cd2c1..9ddc57d 100644 --- a/libs/eigen/doc/examples/Cwise_erf.cpp +++ b/libs/eigen/doc/examples/Cwise_erf.cpp @@ -1,9 +1,8 @@ #include #include #include -using namespace Eigen; int main() { - Array4d v(-0.5,2,0,-7); + Eigen::Array4d v(-0.5,2,0,-7); std::cout << v.erf() << std::endl; } diff --git a/libs/eigen/doc/examples/Cwise_erfc.cpp b/libs/eigen/doc/examples/Cwise_erfc.cpp index d8bb04c..4b7902c 100644 --- a/libs/eigen/doc/examples/Cwise_erfc.cpp +++ b/libs/eigen/doc/examples/Cwise_erfc.cpp @@ -1,9 +1,8 @@ #include #include #include -using namespace Eigen; int main() { - Array4d v(-0.5,2,0,-7); + Eigen::Array4d v(-0.5,2,0,-7); std::cout << v.erfc() << std::endl; } diff --git a/libs/eigen/doc/examples/Cwise_lgamma.cpp b/libs/eigen/doc/examples/Cwise_lgamma.cpp index 6bfaccb..f3c9fe6 100644 --- a/libs/eigen/doc/examples/Cwise_lgamma.cpp +++ b/libs/eigen/doc/examples/Cwise_lgamma.cpp @@ -1,9 +1,8 @@ #include #include #include -using namespace Eigen; int main() { - Array4d v(0.5,10,0,-1); + Eigen::Array4d v(0.5,10,0,-1); std::cout << v.lgamma() << std::endl; } diff --git a/libs/eigen/doc/examples/DenseBase_middleCols_int.cpp b/libs/eigen/doc/examples/DenseBase_middleCols_int.cpp index 0ebd955..d05a552 100644 --- a/libs/eigen/doc/examples/DenseBase_middleCols_int.cpp +++ b/libs/eigen/doc/examples/DenseBase_middleCols_int.cpp @@ -1,15 +1,12 @@ #include #include -using namespace Eigen; -using namespace std; - -int main(void) +int main() { int const N = 5; - MatrixXi A(N,N); + Eigen::MatrixXi A(N,N); A.setRandom(); - cout << "A =\n" << A << '\n' << endl; - cout << "A(1..3,:) =\n" << A.middleCols(1,3) << endl; + std::cout << "A =\n" << A << '\n' << std::endl; + std::cout << "A(1..3,:) =\n" << A.middleCols(1,3) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/DenseBase_middleRows_int.cpp b/libs/eigen/doc/examples/DenseBase_middleRows_int.cpp index a6fe9e8..8651629 100644 --- a/libs/eigen/doc/examples/DenseBase_middleRows_int.cpp +++ b/libs/eigen/doc/examples/DenseBase_middleRows_int.cpp @@ -1,15 +1,12 @@ #include #include -using namespace Eigen; -using namespace std; - -int main(void) +int main() { int const N = 5; - MatrixXi A(N,N); + Eigen::MatrixXi A(N,N); A.setRandom(); - cout << "A =\n" << A << '\n' << endl; - cout << "A(2..3,:) =\n" << A.middleRows(2,2) << endl; + std::cout << "A =\n" << A << '\n' << std::endl; + std::cout << "A(2..3,:) =\n" << A.middleRows(2,2) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/DenseBase_template_int_middleCols.cpp b/libs/eigen/doc/examples/DenseBase_template_int_middleCols.cpp index 6191d79..caefabf 100644 --- a/libs/eigen/doc/examples/DenseBase_template_int_middleCols.cpp +++ b/libs/eigen/doc/examples/DenseBase_template_int_middleCols.cpp @@ -1,15 +1,12 @@ #include #include -using namespace Eigen; -using namespace std; - -int main(void) +int main() { int const N = 5; - MatrixXi A(N,N); + Eigen::MatrixXi A(N,N); A.setRandom(); - cout << "A =\n" << A << '\n' << endl; - cout 
<< "A(:,1..3) =\n" << A.middleCols<3>(1) << endl; + std::cout << "A =\n" << A << '\n' << std::endl; + std::cout << "A(:,1..3) =\n" << A.middleCols<3>(1) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/DenseBase_template_int_middleRows.cpp b/libs/eigen/doc/examples/DenseBase_template_int_middleRows.cpp index 7e8b657..ed5b295 100644 --- a/libs/eigen/doc/examples/DenseBase_template_int_middleRows.cpp +++ b/libs/eigen/doc/examples/DenseBase_template_int_middleRows.cpp @@ -1,15 +1,12 @@ #include #include -using namespace Eigen; -using namespace std; - -int main(void) +int main() { int const N = 5; - MatrixXi A(N,N); + Eigen::MatrixXi A(N,N); A.setRandom(); - cout << "A =\n" << A << '\n' << endl; - cout << "A(1..3,:) =\n" << A.middleRows<3>(1) << endl; + std::cout << "A =\n" << A << '\n' << std::endl; + std::cout << "A(1..3,:) =\n" << A.middleRows<3>(1) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/QuickStart_example2_dynamic.cpp b/libs/eigen/doc/examples/QuickStart_example2_dynamic.cpp index ff6746e..bc8d326 100644 --- a/libs/eigen/doc/examples/QuickStart_example2_dynamic.cpp +++ b/libs/eigen/doc/examples/QuickStart_example2_dynamic.cpp @@ -1,15 +1,15 @@ #include #include -using namespace Eigen; -using namespace std; +using Eigen::MatrixXd; +using Eigen::VectorXd; int main() { MatrixXd m = MatrixXd::Random(3,3); m = (m + MatrixXd::Constant(3,3,1.2)) * 50; - cout << "m =" << endl << m << endl; + std::cout << "m =" << std::endl << m << std::endl; VectorXd v(3); v << 1, 2, 3; - cout << "m * v =" << endl << m * v << endl; + std::cout << "m * v =" << std::endl << m * v << std::endl; } diff --git a/libs/eigen/doc/examples/QuickStart_example2_fixed.cpp b/libs/eigen/doc/examples/QuickStart_example2_fixed.cpp index d911752..af6f9a9 100644 --- a/libs/eigen/doc/examples/QuickStart_example2_fixed.cpp +++ b/libs/eigen/doc/examples/QuickStart_example2_fixed.cpp @@ -1,15 +1,15 @@ #include #include -using namespace Eigen; -using namespace std; +using Eigen::Matrix3d; +using Eigen::Vector3d; int main() { Matrix3d m = Matrix3d::Random(); m = (m + Matrix3d::Constant(1.2)) * 50; - cout << "m =" << endl << m << endl; + std::cout << "m =" << std::endl << m << std::endl; Vector3d v(1,2,3); - cout << "m * v =" << endl << m * v << endl; + std::cout << "m * v =" << std::endl << m * v << std::endl; } diff --git a/libs/eigen/doc/examples/TemplateKeyword_flexible.cpp b/libs/eigen/doc/examples/TemplateKeyword_flexible.cpp index 9d85292..efe458b 100644 --- a/libs/eigen/doc/examples/TemplateKeyword_flexible.cpp +++ b/libs/eigen/doc/examples/TemplateKeyword_flexible.cpp @@ -1,19 +1,17 @@ #include #include -using namespace Eigen; - template -void copyUpperTriangularPart(MatrixBase& dst, const MatrixBase& src) +void copyUpperTriangularPart(Eigen::MatrixBase& dst, const Eigen::MatrixBase& src) { /* Note the 'template' keywords in the following line! 
*/ - dst.template triangularView() = src.template triangularView(); + dst.template triangularView() = src.template triangularView(); } int main() { - MatrixXi m1 = MatrixXi::Ones(5,5); - MatrixXi m2 = MatrixXi::Random(4,4); + Eigen::MatrixXi m1 = Eigen::MatrixXi::Ones(5,5); + Eigen::MatrixXi m2 = Eigen::MatrixXi::Random(4,4); std::cout << "m2 before copy:" << std::endl; std::cout << m2 << std::endl << std::endl; copyUpperTriangularPart(m2, m1.topLeftCorner(4,4)); diff --git a/libs/eigen/doc/examples/TemplateKeyword_simple.cpp b/libs/eigen/doc/examples/TemplateKeyword_simple.cpp index 6998c17..6b946ad 100644 --- a/libs/eigen/doc/examples/TemplateKeyword_simple.cpp +++ b/libs/eigen/doc/examples/TemplateKeyword_simple.cpp @@ -1,11 +1,11 @@ #include #include -using namespace Eigen; +using Eigen::MatrixXf; void copyUpperTriangularPart(MatrixXf& dst, const MatrixXf& src) { - dst.triangularView() = src.triangularView(); + dst.triangularView() = src.triangularView(); } int main() diff --git a/libs/eigen/doc/examples/TutorialInplaceLU.cpp b/libs/eigen/doc/examples/TutorialInplaceLU.cpp index cb9c59b..72bead2 100644 --- a/libs/eigen/doc/examples/TutorialInplaceLU.cpp +++ b/libs/eigen/doc/examples/TutorialInplaceLU.cpp @@ -1,61 +1,57 @@ #include struct init { - init() { std::cout << "[" << "init" << "]" << std::endl; } + init() { std::cout << "[init]\n"; } }; init init_obj; // [init] -#include #include -using namespace std; -using namespace Eigen; - int main() { - MatrixXd A(2,2); + Eigen::MatrixXd A(2,2); A << 2, -1, 1, 3; - cout << "Here is the input matrix A before decomposition:\n" << A << endl; -cout << "[init]" << endl; + std::cout << "Here is the input matrix A before decomposition:\n" << A << "\n"; + std::cout << "[init]\n"; -cout << "[declaration]" << endl; - PartialPivLU > lu(A); - cout << "Here is the input matrix A after decomposition:\n" << A << endl; -cout << "[declaration]" << endl; + std::cout << "[declaration]\n"; + Eigen::PartialPivLU > lu(A); + std::cout << "Here is the input matrix A after decomposition:\n" << A << "\n"; + std::cout << "[declaration]\n"; -cout << "[matrixLU]" << endl; - cout << "Here is the matrix storing the L and U factors:\n" << lu.matrixLU() << endl; -cout << "[matrixLU]" << endl; + std::cout << "[matrixLU]\n"; + std::cout << "Here is the matrix storing the L and U factors:\n" << lu.matrixLU() << "\n"; + std::cout << "[matrixLU]\n"; -cout << "[solve]" << endl; - MatrixXd A0(2,2); A0 << 2, -1, 1, 3; - VectorXd b(2); b << 1, 2; - VectorXd x = lu.solve(b); - cout << "Residual: " << (A0 * x - b).norm() << endl; -cout << "[solve]" << endl; + std::cout << "[solve]\n"; + Eigen::MatrixXd A0(2,2); A0 << 2, -1, 1, 3; + Eigen::VectorXd b(2); b << 1, 2; + Eigen::VectorXd x = lu.solve(b); + std::cout << "Residual: " << (A0 * x - b).norm() << "\n"; + std::cout << "[solve]\n"; -cout << "[modifyA]" << endl; + std::cout << "[modifyA]\n"; A << 3, 4, -2, 1; x = lu.solve(b); - cout << "Residual: " << (A0 * x - b).norm() << endl; -cout << "[modifyA]" << endl; + std::cout << "Residual: " << (A0 * x - b).norm() << "\n"; + std::cout << "[modifyA]\n"; -cout << "[recompute]" << endl; + std::cout << "[recompute]\n"; A0 = A; // save A lu.compute(A); x = lu.solve(b); - cout << "Residual: " << (A0 * x - b).norm() << endl; -cout << "[recompute]" << endl; + std::cout << "Residual: " << (A0 * x - b).norm() << "\n"; + std::cout << "[recompute]\n"; -cout << "[recompute_bis0]" << endl; - MatrixXd A1(2,2); + std::cout << "[recompute_bis0]\n"; + Eigen::MatrixXd A1(2,2); A1 << 5,-2,3,4; 
lu.compute(A1); - cout << "Here is the input matrix A1 after decomposition:\n" << A1 << endl; -cout << "[recompute_bis0]" << endl; + std::cout << "Here is the input matrix A1 after decomposition:\n" << A1 << "\n"; + std::cout << "[recompute_bis0]\n"; -cout << "[recompute_bis1]" << endl; + std::cout << "[recompute_bis1]\n"; x = lu.solve(b); - cout << "Residual: " << (A1 * x - b).norm() << endl; -cout << "[recompute_bis1]" << endl; + std::cout << "Residual: " << (A1 * x - b).norm() << "\n"; + std::cout << "[recompute_bis1]\n"; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgComputeTwice.cpp b/libs/eigen/doc/examples/TutorialLinAlgComputeTwice.cpp index 06ba646..a561f08 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgComputeTwice.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgComputeTwice.cpp @@ -1,23 +1,20 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix2f A, b; - LLT llt; + Eigen::Matrix2f A, b; + Eigen::LLT llt; A << 2, -1, -1, 3; b << 1, 2, 3, 1; - cout << "Here is the matrix A:\n" << A << endl; - cout << "Here is the right hand side b:\n" << b << endl; - cout << "Computing LLT decomposition..." << endl; + std::cout << "Here is the matrix A:\n" << A << std::endl; + std::cout << "Here is the right hand side b:\n" << b << std::endl; + std::cout << "Computing LLT decomposition..." << std::endl; llt.compute(A); - cout << "The solution is:\n" << llt.solve(b) << endl; + std::cout << "The solution is:\n" << llt.solve(b) << std::endl; A(1,1)++; - cout << "The matrix A is now:\n" << A << endl; - cout << "Computing LLT decomposition..." << endl; + std::cout << "The matrix A is now:\n" << A << std::endl; + std::cout << "Computing LLT decomposition..." << std::endl; llt.compute(A); - cout << "The solution is now:\n" << llt.solve(b) << endl; + std::cout << "The solution is now:\n" << llt.solve(b) << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgExComputeSolveError.cpp b/libs/eigen/doc/examples/TutorialLinAlgExComputeSolveError.cpp index f362fb7..199f3f5 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgExComputeSolveError.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgExComputeSolveError.cpp @@ -1,8 +1,7 @@ #include #include -using namespace std; -using namespace Eigen; +using Eigen::MatrixXd; int main() { @@ -10,5 +9,5 @@ int main() MatrixXd b = MatrixXd::Random(100,50); MatrixXd x = A.fullPivLu().solve(b); double relative_error = (A*x - b).norm() / b.norm(); // norm() is L2 norm - cout << "The relative error is:\n" << relative_error << endl; + std::cout << "The relative error is:\n" << relative_error << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp b/libs/eigen/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp index 3a99a94..5ee6b6a 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgExSolveColPivHouseholderQR.cpp @@ -1,17 +1,14 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix3f A; - Vector3f b; + Eigen::Matrix3f A; + Eigen::Vector3f b; A << 1,2,3, 4,5,6, 7,8,10; b << 3, 3, 4; - cout << "Here is the matrix A:\n" << A << endl; - cout << "Here is the vector b:\n" << b << endl; - Vector3f x = A.colPivHouseholderQr().solve(b); - cout << "The solution is:\n" << x << endl; + std::cout << "Here is the matrix A:\n" << A << std::endl; + std::cout << "Here is the vector b:\n" << b << std::endl; + Eigen::Vector3f x = A.colPivHouseholderQr().solve(b); + std::cout << "The 
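TutorialInplaceLU above demonstrates in-place decomposition; a condensed, self-contained sketch of the idiom, assuming the documented PartialPivLU-over-Ref form:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 2, -1,
       1,  3;
  // Factor A in place: the decomposition holds an Eigen::Ref, so A's own
  // storage is overwritten with the packed L and U factors.
  Eigen::PartialPivLU<Eigen::Ref<Eigen::MatrixXd>> lu(A);
  Eigen::VectorXd b(2);
  b << 1, 2;
  std::cout << "x = " << lu.solve(b).transpose() << "\n";
}
```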
solution is:\n" << x << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgExSolveLDLT.cpp b/libs/eigen/doc/examples/TutorialLinAlgExSolveLDLT.cpp index f8beacd..82186d4 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgExSolveLDLT.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgExSolveLDLT.cpp @@ -1,16 +1,13 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix2f A, b; + Eigen::Matrix2f A, b; A << 2, -1, -1, 3; b << 1, 2, 3, 1; - cout << "Here is the matrix A:\n" << A << endl; - cout << "Here is the right hand side b:\n" << b << endl; - Matrix2f x = A.ldlt().solve(b); - cout << "The solution is:\n" << x << endl; + std::cout << "Here is the matrix A:\n" << A << std::endl; + std::cout << "Here is the right hand side b:\n" << b << std::endl; + Eigen::Matrix2f x = A.ldlt().solve(b); + std::cout << "The solution is:\n" << x << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgInverseDeterminant.cpp b/libs/eigen/doc/examples/TutorialLinAlgInverseDeterminant.cpp index 14dde5b..b31a92a 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgInverseDeterminant.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgInverseDeterminant.cpp @@ -1,16 +1,13 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix3f A; + Eigen::Matrix3f A; A << 1, 2, 1, 2, 1, 0, -1, 1, 2; - cout << "Here is the matrix A:\n" << A << endl; - cout << "The determinant of A is " << A.determinant() << endl; - cout << "The inverse of A is:\n" << A.inverse() << endl; + std::cout << "Here is the matrix A:\n" << A << std::endl; + std::cout << "The determinant of A is " << A.determinant() << std::endl; + std::cout << "The inverse of A is:\n" << A.inverse() << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgRankRevealing.cpp b/libs/eigen/doc/examples/TutorialLinAlgRankRevealing.cpp index c516507..fea52ab 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgRankRevealing.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgRankRevealing.cpp @@ -1,20 +1,17 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix3f A; + Eigen::Matrix3f A; A << 1, 2, 5, 2, 1, 4, 3, 0, 3; - cout << "Here is the matrix A:\n" << A << endl; - FullPivLU lu_decomp(A); - cout << "The rank of A is " << lu_decomp.rank() << endl; - cout << "Here is a matrix whose columns form a basis of the null-space of A:\n" - << lu_decomp.kernel() << endl; - cout << "Here is a matrix whose columns form a basis of the column-space of A:\n" - << lu_decomp.image(A) << endl; // yes, have to pass the original A + std::cout << "Here is the matrix A:\n" << A << std::endl; + Eigen::FullPivLU lu_decomp(A); + std::cout << "The rank of A is " << lu_decomp.rank() << std::endl; + std::cout << "Here is a matrix whose columns form a basis of the null-space of A:\n" + << lu_decomp.kernel() << std::endl; + std::cout << "Here is a matrix whose columns form a basis of the column-space of A:\n" + << lu_decomp.image(A) << std::endl; // yes, have to pass the original A } diff --git a/libs/eigen/doc/examples/TutorialLinAlgSVDSolve.cpp b/libs/eigen/doc/examples/TutorialLinAlgSVDSolve.cpp index f109f04..04cbe1b 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgSVDSolve.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgSVDSolve.cpp @@ -1,15 +1,12 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - MatrixXf A = MatrixXf::Random(3, 2); - cout << "Here is the matrix A:\n" << A << endl; - VectorXf b = VectorXf::Random(3); - cout << "Here is the 
right hand side b:\n" << b << endl; - cout << "The least-squares solution is:\n" - << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << endl; + Eigen::MatrixXf A = Eigen::MatrixXf::Random(3, 2); + std::cout << "Here is the matrix A:\n" << A << std::endl; + Eigen::VectorXf b = Eigen::VectorXf::Random(3); + std::cout << "Here is the right hand side b:\n" << b << std::endl; + std::cout << "The least-squares solution is:\n" + << A.template bdcSvd().solve(b) << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp b/libs/eigen/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp index 8d1d1ed..fcf2f33 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgSelfAdjointEigenSolver.cpp @@ -1,18 +1,15 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix2f A; + Eigen::Matrix2f A; A << 1, 2, 2, 3; - cout << "Here is the matrix A:\n" << A << endl; - SelfAdjointEigenSolver eigensolver(A); - if (eigensolver.info() != Success) abort(); - cout << "The eigenvalues of A are:\n" << eigensolver.eigenvalues() << endl; - cout << "Here's a matrix whose columns are eigenvectors of A \n" + std::cout << "Here is the matrix A:\n" << A << std::endl; + Eigen::SelfAdjointEigenSolver eigensolver(A); + if (eigensolver.info() != Eigen::Success) abort(); + std::cout << "The eigenvalues of A are:\n" << eigensolver.eigenvalues() << std::endl; + std::cout << "Here's a matrix whose columns are eigenvectors of A \n" << "corresponding to these eigenvalues:\n" - << eigensolver.eigenvectors() << endl; + << eigensolver.eigenvectors() << std::endl; } diff --git a/libs/eigen/doc/examples/TutorialLinAlgSetThreshold.cpp b/libs/eigen/doc/examples/TutorialLinAlgSetThreshold.cpp index 3956b13..e1335e7 100644 --- a/libs/eigen/doc/examples/TutorialLinAlgSetThreshold.cpp +++ b/libs/eigen/doc/examples/TutorialLinAlgSetThreshold.cpp @@ -1,16 +1,13 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix2d A; + Eigen::Matrix2d A; A << 2, 1, 2, 0.9999999999; - FullPivLU lu(A); - cout << "By default, the rank of A is found to be " << lu.rank() << endl; + Eigen::FullPivLU lu(A); + std::cout << "By default, the rank of A is found to be " << lu.rank() << std::endl; lu.setThreshold(1e-5); - cout << "With threshold 1e-5, the rank of A is found to be " << lu.rank() << endl; + std::cout << "With threshold 1e-5, the rank of A is found to be " << lu.rank() << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ArrayClass_accessors.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_accessors.cpp index dc720ff..0db52a3 100644 --- a/libs/eigen/doc/examples/Tutorial_ArrayClass_accessors.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_accessors.cpp @@ -1,24 +1,21 @@ #include #include -using namespace Eigen; -using namespace std; - int main() { - ArrayXXf m(2,2); + Eigen::ArrayXXf m(2,2); // assign some values coefficient by coefficient m(0,0) = 1.0; m(0,1) = 2.0; m(1,0) = 3.0; m(1,1) = m(0,1) + m(1,0); // print values to standard output - cout << m << endl << endl; + std::cout << m << std::endl << std::endl; // using the comma-initializer is also allowed m << 1.0,2.0, 3.0,4.0; // print values to standard output - cout << m << endl; + std::cout << m << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ArrayClass_addition.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_addition.cpp index 480ffb0..4a407a7 100644 --- 
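The least-squares example above now requests the thin U and V factors through the decomposition's template options rather than a runtime argument. A hedged sketch of that calling convention, assuming the post-3.4 API this patch targets:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(3, 2);
  Eigen::VectorXf b = Eigen::VectorXf::Random(3);
  // ComputeThinU/ComputeThinV move from a constructor argument into the
  // template parameter list; solve() then gives the least-squares solution.
  Eigen::VectorXf x =
      A.bdcSvd<Eigen::ComputeThinU | Eigen::ComputeThinV>().solve(b);
  std::cout << "least-squares solution:\n" << x << "\n";
}
```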
a/libs/eigen/doc/examples/Tutorial_ArrayClass_addition.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_addition.cpp @@ -1,13 +1,10 @@ #include #include -using namespace Eigen; -using namespace std; - int main() { - ArrayXXf a(3,3); - ArrayXXf b(3,3); + Eigen::ArrayXXf a(3,3); + Eigen::ArrayXXf b(3,3); a << 1,2,3, 4,5,6, 7,8,9; @@ -16,8 +13,8 @@ int main() 1,2,3; // Adding two arrays - cout << "a + b = " << endl << a + b << endl << endl; + std::cout << "a + b = " << std::endl << a + b << std::endl << std::endl; // Subtracting a scalar from an array - cout << "a - 2 = " << endl << a - 2 << endl; + std::cout << "a - 2 = " << std::endl << a - 2 << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ArrayClass_cwise_other.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_cwise_other.cpp index d9046c6..12483f3 100644 --- a/libs/eigen/doc/examples/Tutorial_ArrayClass_cwise_other.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_cwise_other.cpp @@ -1,19 +1,16 @@ #include #include -using namespace Eigen; -using namespace std; - int main() { - ArrayXf a = ArrayXf::Random(5); + Eigen::ArrayXf a = Eigen::ArrayXf::Random(5); a *= 2; - cout << "a =" << endl - << a << endl; - cout << "a.abs() =" << endl - << a.abs() << endl; - cout << "a.abs().sqrt() =" << endl - << a.abs().sqrt() << endl; - cout << "a.min(a.abs().sqrt()) =" << endl - << a.min(a.abs().sqrt()) << endl; + std::cout << "a =" << std::endl + << a << std::endl; + std::cout << "a.abs() =" << std::endl + << a.abs() << std::endl; + std::cout << "a.abs().sqrt() =" << std::endl + << a.abs().sqrt() << std::endl; + std::cout << "a.min(a.abs().sqrt()) =" << std::endl + << a.min(a.abs().sqrt()) << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ArrayClass_interop.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_interop.cpp index 371f070..c9a8352 100644 --- a/libs/eigen/doc/examples/Tutorial_ArrayClass_interop.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_interop.cpp @@ -1,8 +1,7 @@ #include #include -using namespace Eigen; -using namespace std; +using Eigen::MatrixXf; int main() { @@ -16,7 +15,7 @@ int main() 7,8; result = (m.array() + 4).matrix() * m; - cout << "-- Combination 1: --" << endl << result << endl << endl; + std::cout << "-- Combination 1: --\n" << result << "\n\n"; result = (m.array() * n.array()).matrix() * m; - cout << "-- Combination 2: --" << endl << result << endl << endl; + std::cout << "-- Combination 2: --\n" << result << "\n\n"; } diff --git a/libs/eigen/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp index 1014275..07ec9b0 100644 --- a/libs/eigen/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_interop_matrix.cpp @@ -1,8 +1,7 @@ #include #include -using namespace Eigen; -using namespace std; +using Eigen::MatrixXf; int main() { @@ -16,11 +15,11 @@ int main() 7,8; result = m * n; - cout << "-- Matrix m*n: --" << endl << result << endl << endl; + std::cout << "-- Matrix m*n: --\n" << result << "\n\n"; result = m.array() * n.array(); - cout << "-- Array m*n: --" << endl << result << endl << endl; + std::cout << "-- Array m*n: --\n" << result << "\n\n"; result = m.cwiseProduct(n); - cout << "-- With cwiseProduct: --" << endl << result << endl << endl; + std::cout << "-- With cwiseProduct: --\n" << result << "\n\n"; result = m.array() + 4; - cout << "-- Array m + 4: --" << endl << result << endl << endl; + std::cout << "-- Array m + 4: --\n" << result << "\n\n"; } diff --git 
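The interop examples above mix array and matrix worlds; a compact sketch of the equivalent spellings of a coefficient-wise product:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf m(2, 2), n(2, 2);
  m << 1, 2,
       3, 4;
  n << 5, 6,
       7, 8;
  // Coefficient-wise product, two equivalent spellings; only plain
  // operator* on matrices performs a true matrix product.
  std::cout << (m.array() * n.array()).matrix() << "\n\n";
  std::cout << m.cwiseProduct(n) << "\n";
}
```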
a/libs/eigen/doc/examples/Tutorial_ArrayClass_mult.cpp b/libs/eigen/doc/examples/Tutorial_ArrayClass_mult.cpp index 6cb439f..bada36c 100644 --- a/libs/eigen/doc/examples/Tutorial_ArrayClass_mult.cpp +++ b/libs/eigen/doc/examples/Tutorial_ArrayClass_mult.cpp @@ -1,16 +1,13 @@ #include #include -using namespace Eigen; -using namespace std; - int main() { - ArrayXXf a(2,2); - ArrayXXf b(2,2); + Eigen::ArrayXXf a(2,2); + Eigen::ArrayXXf b(2,2); a << 1,2, 3,4; b << 5,6, 7,8; - cout << "a * b = " << endl << a * b << endl; + std::cout << "a * b = " << std::endl << a * b << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp b/libs/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp index 0b87313..26ad478 100644 --- a/libs/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp +++ b/libs/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp @@ -1,18 +1,15 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Array22f m; + Eigen::Array22f m; m << 1,2, 3,4; - Array44f a = Array44f::Constant(0.6); - cout << "Here is the array a:" << endl << a << endl << endl; + Eigen::Array44f a = Eigen::Array44f::Constant(0.6); + std::cout << "Here is the array a:\n" << a << "\n\n"; a.block<2,2>(1,1) = m; - cout << "Here is now a with m copied into its central 2x2 block:" << endl << a << endl << endl; + std::cout << "Here is now a with m copied into its central 2x2 block:\n" << a << "\n\n"; a.block(0,0,2,3) = a.block(2,1,2,3); - cout << "Here is now a with bottom-right 2x3 block copied into top-left 2x3 block:" << endl << a << endl << endl; + std::cout << "Here is now a with bottom-right 2x3 block copied into top-left 2x3 block:\n" << a << "\n\n"; } diff --git a/libs/eigen/doc/examples/Tutorial_PartialLU_solve.cpp b/libs/eigen/doc/examples/Tutorial_PartialLU_solve.cpp index a560879..ca72c99 100644 --- a/libs/eigen/doc/examples/Tutorial_PartialLU_solve.cpp +++ b/libs/eigen/doc/examples/Tutorial_PartialLU_solve.cpp @@ -2,17 +2,14 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - Matrix3f A; - Vector3f b; + Eigen::Matrix3f A; + Eigen::Vector3f b; A << 1,2,3, 4,5,6, 7,8,10; b << 3, 3, 4; - cout << "Here is the matrix A:" << endl << A << endl; - cout << "Here is the vector b:" << endl << b << endl; - Vector3f x = A.lu().solve(b); - cout << "The solution is:" << endl << x << endl; + std::cout << "Here is the matrix A:" << std::endl << A << std::endl; + std::cout << "Here is the vector b:" << std::endl << b << std::endl; + Eigen::Vector3f x = A.lu().solve(b); + std::cout << "The solution is:" << std::endl << x << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp index 334b4d8..8ef06be 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp @@ -1,9 +1,6 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { Eigen::MatrixXf m(2,4); @@ -15,10 +12,10 @@ int main() v << 2, 3; - MatrixXf::Index index; + Eigen::Index index; // find nearest neighbour (m.colwise() - v).colwise().squaredNorm().minCoeff(&index); - cout << "Nearest neighbour is column " << index << ":" << endl; - cout << m.col(index) << endl; + std::cout << "Nearest neighbour is column " << index << ":" << std::endl; + std::cout << 
m.col(index) << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp index 049c747..b5d88c3 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp @@ -1,15 +1,13 @@ #include #include -using namespace std; -using namespace Eigen; int main() { - MatrixXf mat(2,4); + Eigen::MatrixXf mat(2,4); mat << 1, 2, 6, 9, 3, 1, 7, 2; - MatrixXf::Index maxIndex; + Eigen::Index maxIndex; float maxNorm = mat.colwise().sum().maxCoeff(&maxIndex); std::cout << "Maximum sum at position " << maxIndex << std::endl; diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp index 0cca37f..7b89bcf 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp @@ -1,21 +1,18 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - ArrayXXf a(2,2); + Eigen::ArrayXXf a(2,2); a << 1,2, 3,4; - cout << "(a > 0).all() = " << (a > 0).all() << endl; - cout << "(a > 0).any() = " << (a > 0).any() << endl; - cout << "(a > 0).count() = " << (a > 0).count() << endl; - cout << endl; - cout << "(a > 2).all() = " << (a > 2).all() << endl; - cout << "(a > 2).any() = " << (a > 2).any() << endl; - cout << "(a > 2).count() = " << (a > 2).count() << endl; + std::cout << "(a > 0).all() = " << (a > 0).all() << std::endl; + std::cout << "(a > 0).any() = " << (a > 0).any() << std::endl; + std::cout << "(a > 0).count() = " << (a > 0).count() << std::endl; + std::cout << std::endl; + std::cout << "(a > 2).all() = " << (a > 2).all() << std::endl; + std::cout << "(a > 2).any() = " << (a > 2).any() << std::endl; + std::cout << "(a > 2).count() = " << (a > 2).count() << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp index 740439f..7519137 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp @@ -1,13 +1,10 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { - VectorXf v(2); - MatrixXf m(2,2), n(2,2); + Eigen::VectorXf v(2); + Eigen::MatrixXf m(2,2), n(2,2); v << -1, 2; @@ -15,14 +12,14 @@ int main() m << 1,-2, -3,4; - cout << "v.squaredNorm() = " << v.squaredNorm() << endl; - cout << "v.norm() = " << v.norm() << endl; - cout << "v.lpNorm<1>() = " << v.lpNorm<1>() << endl; - cout << "v.lpNorm() = " << v.lpNorm() << endl; + std::cout << "v.squaredNorm() = " << v.squaredNorm() << std::endl; + std::cout << "v.norm() = " << v.norm() << std::endl; + std::cout << "v.lpNorm<1>() = " << v.lpNorm<1>() << std::endl; + std::cout << "v.lpNorm() = " << v.lpNorm() << std::endl; - cout << endl; - cout << "m.squaredNorm() = " << m.squaredNorm() << endl; - cout << "m.norm() = " << m.norm() << endl; - cout << "m.lpNorm<1>() = " << m.lpNorm<1>() << endl; - cout << "m.lpNorm() = " << m.lpNorm() << endl; + std::cout << std::endl; + std::cout << "m.squaredNorm() = " << m.squaredNorm() << std::endl; + std::cout << "m.norm() = " << m.norm() << std::endl; + 
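The reductions_norm tutorial above walks through the lpNorm family; the infinity norm is selected with the Eigen::Infinity template argument, as in this minimal sketch:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXf v(2);
  v << -1, 2;
  std::cout << v.norm() << "\n";                     // Euclidean norm: sqrt(5)
  std::cout << v.lpNorm<1>() << "\n";                // |-1| + |2| = 3
  std::cout << v.lpNorm<Eigen::Infinity>() << "\n";  // max(|-1|, |2|) = 2
}
```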
std::cout << "m.lpNorm<1>() = " << m.lpNorm<1>() << std::endl; + std::cout << "m.lpNorm() = " << m.lpNorm() << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp index 62e28fc..8faa5a1 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp @@ -1,18 +1,15 @@ #include #include -using namespace Eigen; -using namespace std; - int main() { - MatrixXf m(2,2); + Eigen::MatrixXf m(2,2); m << 1,-2, -3,4; - cout << "1-norm(m) = " << m.cwiseAbs().colwise().sum().maxCoeff() - << " == " << m.colwise().lpNorm<1>().maxCoeff() << endl; + std::cout << "1-norm(m) = " << m.cwiseAbs().colwise().sum().maxCoeff() + << " == " << m.colwise().lpNorm<1>().maxCoeff() << std::endl; - cout << "infty-norm(m) = " << m.cwiseAbs().rowwise().sum().maxCoeff() - << " == " << m.rowwise().lpNorm<1>().maxCoeff() << endl; + std::cout << "infty-norm(m) = " << m.cwiseAbs().rowwise().sum().maxCoeff() + << " == " << m.rowwise().lpNorm<1>().maxCoeff() << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp index b54e9aa..bd294bd 100644 --- a/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp +++ b/libs/eigen/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp @@ -1,9 +1,6 @@ #include #include -using namespace std; -using namespace Eigen; - int main() { Eigen::MatrixXf m(2,2); @@ -12,15 +9,15 @@ int main() 3, 4; //get location of maximum - MatrixXf::Index maxRow, maxCol; + Eigen::Index maxRow, maxCol; float max = m.maxCoeff(&maxRow, &maxCol); //get location of minimum - MatrixXf::Index minRow, minCol; + Eigen::Index minRow, minCol; float min = m.minCoeff(&minRow, &minCol); - cout << "Max: " << max << ", at: " << - maxRow << "," << maxCol << endl; - cout << "Min: " << min << ", at: " << - minRow << "," << minCol << endl; + std::cout << "Max: " << max << ", at: " << + maxRow << "," << maxCol << std::endl; + std:: cout << "Min: " << min << ", at: " << + minRow << "," << minCol << std::endl; } diff --git a/libs/eigen/doc/examples/Tutorial_simple_example_dynamic_size.cpp b/libs/eigen/doc/examples/Tutorial_simple_example_dynamic_size.cpp index defcb1e..796bd87 100644 --- a/libs/eigen/doc/examples/Tutorial_simple_example_dynamic_size.cpp +++ b/libs/eigen/doc/examples/Tutorial_simple_example_dynamic_size.cpp @@ -1,13 +1,11 @@ #include #include -using namespace Eigen; - int main() { for (int size=1; size<=4; ++size) { - MatrixXi m(size,size+1); // a (size)x(size+1)-matrix of int's + Eigen::MatrixXi m(size,size+1); // a (size)x(size+1)-matrix of int's for (int j=0; j #include -using namespace Eigen; - int main() { - Matrix3f m3; + Eigen::Matrix3f m3; m3 << 1, 2, 3, 4, 5, 6, 7, 8, 9; - Matrix4f m4 = Matrix4f::Identity(); - Vector4i v4(1, 2, 3, 4); + Eigen::Matrix4f m4 = Eigen::Matrix4f::Identity(); + Eigen::Vector4i v4(1, 2, 3, 4); std::cout << "m3\n" << m3 << "\nm4:\n" << m4 << "\nv4:\n" << v4 << std::endl; diff --git a/libs/eigen/doc/examples/class_Block.cpp b/libs/eigen/doc/examples/class_Block.cpp index ace719a..9ace0da 100644 --- a/libs/eigen/doc/examples/class_Block.cpp +++ b/libs/eigen/doc/examples/class_Block.cpp @@ -1,27 +1,25 @@ #include #include -using namespace 
Eigen; -using namespace std; template Eigen::Block -topLeftCorner(MatrixBase& m, int rows, int cols) +topLeftCorner(Eigen::MatrixBase& m, int rows, int cols) { return Eigen::Block(m.derived(), 0, 0, rows, cols); } template const Eigen::Block -topLeftCorner(const MatrixBase& m, int rows, int cols) +topLeftCorner(const Eigen::MatrixBase& m, int rows, int cols) { return Eigen::Block(m.derived(), 0, 0, rows, cols); } int main(int, char**) { - Matrix4d m = Matrix4d::Identity(); - cout << topLeftCorner(4*m, 2, 3) << endl; // calls the const version + Eigen::Matrix4d m = Eigen::Matrix4d::Identity(); + std::cout << topLeftCorner(4*m, 2, 3) << std::endl; // calls the const version topLeftCorner(m, 2, 3) *= 5; // calls the non-const version - cout << "Now the matrix m is:" << endl << m << endl; + std::cout << "Now the matrix m is:" << std::endl << m << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_CwiseBinaryOp.cpp b/libs/eigen/doc/examples/class_CwiseBinaryOp.cpp index 682af46..973befd 100644 --- a/libs/eigen/doc/examples/class_CwiseBinaryOp.cpp +++ b/libs/eigen/doc/examples/class_CwiseBinaryOp.cpp @@ -1,18 +1,17 @@ #include #include -using namespace Eigen; -using namespace std; + +using Eigen::Matrix4d; // define a custom template binary functor template struct MakeComplexOp { - EIGEN_EMPTY_STRUCT_CTOR(MakeComplexOp) - typedef complex result_type; - complex operator()(const Scalar& a, const Scalar& b) const { return complex(a,b); } + typedef std::complex result_type; + result_type operator()(const Scalar& a, const Scalar& b) const { return result_type(a,b); } }; int main(int, char**) { Matrix4d m1 = Matrix4d::Random(), m2 = Matrix4d::Random(); - cout << m1.binaryExpr(m2, MakeComplexOp()) << endl; + std::cout << m1.binaryExpr(m2, MakeComplexOp()) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_CwiseUnaryOp.cpp b/libs/eigen/doc/examples/class_CwiseUnaryOp.cpp index a5fcc15..6c65f2e 100644 --- a/libs/eigen/doc/examples/class_CwiseUnaryOp.cpp +++ b/libs/eigen/doc/examples/class_CwiseUnaryOp.cpp @@ -1,7 +1,5 @@ #include #include -using namespace Eigen; -using namespace std; // define a custom template unary functor template @@ -13,7 +11,7 @@ struct CwiseClampOp { int main(int, char**) { - Matrix4d m1 = Matrix4d::Random(); - cout << m1 << endl << "becomes: " << endl << m1.unaryExpr(CwiseClampOp(-0.5,0.5)) << endl; + Eigen::Matrix4d m1 = Eigen::Matrix4d::Random(); + std::cout << m1 << std::endl << "becomes: " << std::endl << m1.unaryExpr(CwiseClampOp(-0.5,0.5)) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_CwiseUnaryOp_ptrfun.cpp b/libs/eigen/doc/examples/class_CwiseUnaryOp_ptrfun.cpp index 36706d8..e97095e 100644 --- a/libs/eigen/doc/examples/class_CwiseUnaryOp_ptrfun.cpp +++ b/libs/eigen/doc/examples/class_CwiseUnaryOp_ptrfun.cpp @@ -1,7 +1,5 @@ #include #include -using namespace Eigen; -using namespace std; // define function to be applied coefficient-wise double ramp(double x) @@ -14,7 +12,7 @@ double ramp(double x) int main(int, char**) { - Matrix4d m1 = Matrix4d::Random(); - cout << m1 << endl << "becomes: " << endl << m1.unaryExpr(ptr_fun(ramp)) << endl; + Eigen::Matrix4d m1 = Eigen::Matrix4d::Random(); + std::cout << m1 << std::endl << "becomes: " << std::endl << m1.unaryExpr(std::ptr_fun(ramp)) << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_FixedBlock.cpp b/libs/eigen/doc/examples/class_FixedBlock.cpp index 9978b32..4bb2d44 100644 --- a/libs/eigen/doc/examples/class_FixedBlock.cpp +++ 
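class_CwiseUnaryOp_ptrfun.cpp above still goes through std::ptr_fun, which C++11 deprecated and C++17 removed. unaryExpr accepts any callable, so a lambda (not part of the patch) sidesteps the dependency entirely:

```cpp
#include <Eigen/Dense>
#include <iostream>

// Same coefficient-wise function as the example above.
double ramp(double x) { return x > 0 ? x : 0; }

int main() {
  Eigen::Matrix4d m1 = Eigen::Matrix4d::Random();
  // A lambda (or the raw function pointer) replaces std::ptr_fun(ramp)
  // and keeps the example valid under C++17 and later.
  std::cout << m1.unaryExpr([](double x) { return ramp(x); }) << std::endl;
}
```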
b/libs/eigen/doc/examples/class_FixedBlock.cpp @@ -1,27 +1,25 @@ #include #include -using namespace Eigen; -using namespace std; template Eigen::Block -topLeft2x2Corner(MatrixBase& m) +topLeft2x2Corner(Eigen::MatrixBase& m) { return Eigen::Block(m.derived(), 0, 0); } template const Eigen::Block -topLeft2x2Corner(const MatrixBase& m) +topLeft2x2Corner(const Eigen::MatrixBase& m) { return Eigen::Block(m.derived(), 0, 0); } int main(int, char**) { - Matrix3d m = Matrix3d::Identity(); - cout << topLeft2x2Corner(4*m) << endl; // calls the const version + Eigen::Matrix3d m = Eigen::Matrix3d::Identity(); + std::cout << topLeft2x2Corner(4*m) << std::endl; // calls the const version topLeft2x2Corner(m) *= 2; // calls the non-const version - cout << "Now the matrix m is:" << endl << m << endl; + std::cout << "Now the matrix m is:" << std::endl << m << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_FixedReshaped.cpp b/libs/eigen/doc/examples/class_FixedReshaped.cpp index b6d4085..be7069d 100644 --- a/libs/eigen/doc/examples/class_FixedReshaped.cpp +++ b/libs/eigen/doc/examples/class_FixedReshaped.cpp @@ -1,22 +1,20 @@ #include #include -using namespace Eigen; -using namespace std; template Eigen::Reshaped -reshape_helper(MatrixBase& m) +reshape_helper(Eigen::MatrixBase& m) { return Eigen::Reshaped(m.derived()); } int main(int, char**) { - MatrixXd m(2, 4); + Eigen::MatrixXd m(2, 4); m << 1, 2, 3, 4, 5, 6, 7, 8; - MatrixXd n = reshape_helper(m); - cout << "matrix m is:" << endl << m << endl; - cout << "matrix n is:" << endl << n << endl; + Eigen::MatrixXd n = reshape_helper(m); + std::cout << "matrix m is:" << std::endl << m << std::endl; + std::cout << "matrix n is:" << std::endl << n << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_FixedVectorBlock.cpp b/libs/eigen/doc/examples/class_FixedVectorBlock.cpp index c88c9fb..eed3007 100644 --- a/libs/eigen/doc/examples/class_FixedVectorBlock.cpp +++ b/libs/eigen/doc/examples/class_FixedVectorBlock.cpp @@ -1,27 +1,25 @@ #include #include -using namespace Eigen; -using namespace std; template Eigen::VectorBlock -firstTwo(MatrixBase& v) +firstTwo(Eigen::MatrixBase& v) { return Eigen::VectorBlock(v.derived(), 0); } template const Eigen::VectorBlock -firstTwo(const MatrixBase& v) +firstTwo(const Eigen::MatrixBase& v) { return Eigen::VectorBlock(v.derived(), 0); } int main(int, char**) { - Matrix v; v << 1,2,3,4,5,6; - cout << firstTwo(4*v) << endl; // calls the const version + Eigen::Matrix v; v << 1,2,3,4,5,6; + std::cout << firstTwo(4*v) << std::endl; // calls the const version firstTwo(v) *= 2; // calls the non-const version - cout << "Now the vector v is:" << endl << v << endl; + std::cout << "Now the vector v is:" << std::endl << v << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/class_Reshaped.cpp b/libs/eigen/doc/examples/class_Reshaped.cpp index 18fb454..7219853 100644 --- a/libs/eigen/doc/examples/class_Reshaped.cpp +++ b/libs/eigen/doc/examples/class_Reshaped.cpp @@ -1,23 +1,21 @@ #include #include -using namespace std; -using namespace Eigen; template -const Reshaped -reshape_helper(const MatrixBase& m, int rows, int cols) +const Eigen::Reshaped +reshape_helper(const Eigen::MatrixBase& m, int rows, int cols) { - return Reshaped(m.derived(), rows, cols); + return Eigen::Reshaped(m.derived(), rows, cols); } int main(int, char**) { - MatrixXd m(3, 4); + Eigen::MatrixXd m(3, 4); m << 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12; - cout << m << endl; - Ref n = reshape_helper(m, 2, 6); - cout << "Matrix m is:" 
<< endl << m << endl; - cout << "Matrix n is:" << endl << n << endl; + std::cout << m << std::endl; + Eigen::Ref n = reshape_helper(m, 2, 6); + std::cout << "Matrix m is:" << std::endl << m << std::endl; + std::cout << "Matrix n is:" << std::endl << n << std::endl; } diff --git a/libs/eigen/doc/examples/class_VectorBlock.cpp b/libs/eigen/doc/examples/class_VectorBlock.cpp index dc213df..5cee147 100644 --- a/libs/eigen/doc/examples/class_VectorBlock.cpp +++ b/libs/eigen/doc/examples/class_VectorBlock.cpp @@ -1,27 +1,25 @@ #include #include -using namespace Eigen; -using namespace std; template Eigen::VectorBlock -segmentFromRange(MatrixBase& v, int start, int end) +segmentFromRange(Eigen::MatrixBase& v, int start, int end) { return Eigen::VectorBlock(v.derived(), start, end-start); } template const Eigen::VectorBlock -segmentFromRange(const MatrixBase& v, int start, int end) +segmentFromRange(const Eigen::MatrixBase& v, int start, int end) { return Eigen::VectorBlock(v.derived(), start, end-start); } int main(int, char**) { - Matrix v; v << 1,2,3,4,5,6; - cout << segmentFromRange(2*v, 2, 4) << endl; // calls the const version + Eigen::Matrix v; v << 1,2,3,4,5,6; + std::cout << segmentFromRange(2*v, 2, 4) << std::endl; // calls the const version segmentFromRange(v, 1, 3) *= 5; // calls the non-const version - cout << "Now the vector v is:" << endl << v << endl; + std::cout << "Now the vector v is:" << std::endl << v << std::endl; return 0; } diff --git a/libs/eigen/doc/examples/function_taking_eigenbase.cpp b/libs/eigen/doc/examples/function_taking_eigenbase.cpp index 49d94b3..4e1e5a9 100644 --- a/libs/eigen/doc/examples/function_taking_eigenbase.cpp +++ b/libs/eigen/doc/examples/function_taking_eigenbase.cpp @@ -1,9 +1,8 @@ #include #include -using namespace Eigen; template -void print_size(const EigenBase& b) +void print_size(const Eigen::EigenBase& b) { std::cout << "size (rows, cols): " << b.size() << " (" << b.rows() << ", " << b.cols() << ")" << std::endl; @@ -11,7 +10,7 @@ void print_size(const EigenBase& b) int main() { - Vector3f v; + Eigen::Vector3f v; print_size(v); // v.asDiagonal() returns a 3x3 diagonal matrix pseudo-expression print_size(v.asDiagonal()); diff --git a/libs/eigen/doc/examples/function_taking_ref.cpp b/libs/eigen/doc/examples/function_taking_ref.cpp index 162a202..a837e19 100644 --- a/libs/eigen/doc/examples/function_taking_ref.cpp +++ b/libs/eigen/doc/examples/function_taking_ref.cpp @@ -1,19 +1,17 @@ #include #include -using namespace Eigen; -using namespace std; -float inv_cond(const Ref& a) +float inv_cond(const Eigen::Ref& a) { - const VectorXf sing_vals = a.jacobiSvd().singularValues(); + const Eigen::VectorXf sing_vals = a.jacobiSvd().singularValues(); return sing_vals(sing_vals.size()-1) / sing_vals(0); } int main() { - Matrix4f m = Matrix4f::Random(); - cout << "matrix m:" << endl << m << endl << endl; - cout << "inv_cond(m): " << inv_cond(m) << endl; - cout << "inv_cond(m(1:3,1:3)): " << inv_cond(m.topLeftCorner(3,3)) << endl; - cout << "inv_cond(m+I): " << inv_cond(m+Matrix4f::Identity()) << endl; + Eigen::MatrixXf m = Eigen::MatrixXf::Random(4, 4); + std::cout << "matrix m:\n" << m << "\n\n"; + std::cout << "inv_cond(m): " << inv_cond(m) << "\n"; + std::cout << "inv_cond(m(1:3,1:3)): " << inv_cond(m.topLeftCorner(3,3)) << "\n"; + std::cout << "inv_cond(m+I): " << inv_cond(m+Eigen::MatrixXf::Identity(4, 4)) << "\n"; } diff --git a/libs/eigen/doc/examples/make_circulant.cpp.evaluator b/libs/eigen/doc/examples/make_circulant.cpp.evaluator index 
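function_taking_ref.cpp above relies on Eigen::Ref binding cheaply to many argument kinds; a hypothetical helper (first_coeff is not part of the patch) illustrating the same idiom:

```cpp
#include <Eigen/Dense>
#include <iostream>

// Ref<const MatrixXf> binds to plain matrices and their blocks without
// copying; a general expression like m + I is first evaluated into a
// hidden temporary, which is why inv_cond above can accept all three.
float first_coeff(const Eigen::Ref<const Eigen::MatrixXf>& a) {
  return a(0, 0);
}

int main() {
  Eigen::MatrixXf m = Eigen::MatrixXf::Random(4, 4);
  std::cout << first_coeff(m) << "\n";
  std::cout << first_coeff(m.topLeftCorner(3, 3)) << "\n";
  std::cout << first_coeff(m + Eigen::MatrixXf::Identity(4, 4)) << "\n";
}
```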
2ba79e7..cd461b9 100644 --- a/libs/eigen/doc/examples/make_circulant.cpp.evaluator +++ b/libs/eigen/doc/examples/make_circulant.cpp.evaluator @@ -6,7 +6,7 @@ namespace Eigen { { typedef Circulant XprType; typedef typename nested_eval::type ArgTypeNested; - typedef typename remove_all::type ArgTypeNestedCleaned; + typedef remove_all_t ArgTypeNestedCleaned; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { diff --git a/libs/eigen/doc/examples/make_circulant2.cpp b/libs/eigen/doc/examples/make_circulant2.cpp index 95d3dd3..d86a66b 100644 --- a/libs/eigen/doc/examples/make_circulant2.cpp +++ b/libs/eigen/doc/examples/make_circulant2.cpp @@ -1,8 +1,6 @@ #include #include -using namespace Eigen; - // [circulant_func] template class circulant_functor { @@ -10,8 +8,8 @@ class circulant_functor { public: circulant_functor(const ArgType& arg) : m_vec(arg) {} - const typename ArgType::Scalar& operator() (Index row, Index col) const { - Index index = row - col; + const typename ArgType::Scalar& operator() (Eigen::Index row, Eigen::Index col) const { + Eigen::Index index = row - col; if (index < 0) index += m_vec.size(); return m_vec(index); } @@ -21,10 +19,10 @@ public: // [square] template struct circulant_helper { - typedef Matrix MatrixType; }; @@ -32,7 +30,7 @@ struct circulant_helper { // [makeCirculant] template -CwiseNullaryOp, typename circulant_helper::MatrixType> +Eigen::CwiseNullaryOp, typename circulant_helper::MatrixType> makeCirculant(const Eigen::MatrixBase& arg) { typedef typename circulant_helper::MatrixType MatrixType; diff --git a/libs/eigen/doc/examples/nullary_indexing.cpp b/libs/eigen/doc/examples/nullary_indexing.cpp index b74db5f..38260af 100644 --- a/libs/eigen/doc/examples/nullary_indexing.cpp +++ b/libs/eigen/doc/examples/nullary_indexing.cpp @@ -1,8 +1,6 @@ #include #include -using namespace Eigen; - // [functor] template class indexing_functor { @@ -10,10 +8,10 @@ class indexing_functor { const RowIndexType &m_rowIndices; const ColIndexType &m_colIndices; public: - typedef Matrix MatrixType; @@ -21,7 +19,7 @@ public: : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices) {} - const typename ArgType::Scalar& operator() (Index row, Index col) const { + const typename ArgType::Scalar& operator() (Eigen::Index row, Eigen::Index col) const { return m_arg(m_rowIndices[row], m_colIndices[col]); } }; @@ -29,7 +27,7 @@ public: // [function] template -CwiseNullaryOp, typename indexing_functor::MatrixType> +Eigen::CwiseNullaryOp, typename indexing_functor::MatrixType> mat_indexing(const Eigen::MatrixBase& arg, const RowIndexType& row_indices, const ColIndexType& col_indices) { typedef indexing_functor Func; @@ -43,8 +41,8 @@ int main() { std::cout << "[main1]\n"; Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4); - Array3i ri(1,2,1); - ArrayXi ci(6); ci << 3,2,1,0,0,2; + Eigen::Array3i ri(1,2,1); + Eigen::ArrayXi ci(6); ci << 3,2,1,0,0,2; Eigen::MatrixXi B = mat_indexing(A, ri, ci); std::cout << "A =" << std::endl; std::cout << A << std::endl << std::endl; @@ -56,11 +54,9 @@ int main() B = mat_indexing(A, ri+1, ci); std::cout << "A(ri+1,ci) =" << std::endl; std::cout << B << std::endl << std::endl; -#if EIGEN_COMP_CXXVER >= 11 - B = mat_indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)); + B = mat_indexing(A, Eigen::ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), Eigen::ArrayXi::LinSpaced(4,0,3)); std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), 
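make_circulant2.cpp and nullary_indexing.cpp above assemble CwiseNullaryOp expressions by hand; for simple cases the NullaryExpr helper with a lambda achieves the same effect. A hedged sketch of a circulant matrix built that way:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXd vec(4);
  vec << 0, 1, 2, 3;
  // Entry (i,j) is generated on the fly from the functor, just like the
  // hand-rolled circulant_functor above; each column is a cyclic shift.
  Eigen::MatrixXd circ = Eigen::MatrixXd::NullaryExpr(
      4, 4, [&vec](Eigen::Index i, Eigen::Index j) {
        Eigen::Index k = i - j;
        if (k < 0) k += vec.size();
        return vec(k);
      });
  std::cout << circ << "\n";
}
```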
ArrayXi::LinSpaced(4,0,3)) =" << std::endl; std::cout << B << std::endl << std::endl; -#endif std::cout << "[main2]\n"; } diff --git a/libs/eigen/doc/examples/tut_arithmetic_add_sub.cpp b/libs/eigen/doc/examples/tut_arithmetic_add_sub.cpp index e97477b..95162c0 100644 --- a/libs/eigen/doc/examples/tut_arithmetic_add_sub.cpp +++ b/libs/eigen/doc/examples/tut_arithmetic_add_sub.cpp @@ -1,14 +1,12 @@ #include #include -using namespace Eigen; - int main() { - Matrix2d a; + Eigen::Matrix2d a; a << 1, 2, 3, 4; - MatrixXd b(2,2); + Eigen::MatrixXd b(2,2); b << 2, 3, 1, 4; std::cout << "a + b =\n" << a + b << std::endl; @@ -16,7 +14,7 @@ int main() std::cout << "Doing a += b;" << std::endl; a += b; std::cout << "Now a =\n" << a << std::endl; - Vector3d v(1,2,3); - Vector3d w(1,0,0); + Eigen::Vector3d v(1,2,3); + Eigen::Vector3d w(1,0,0); std::cout << "-v + w - v =\n" << -v + w - v << std::endl; } diff --git a/libs/eigen/doc/examples/tut_arithmetic_dot_cross.cpp b/libs/eigen/doc/examples/tut_arithmetic_dot_cross.cpp index 631c9a5..d95e03c 100644 --- a/libs/eigen/doc/examples/tut_arithmetic_dot_cross.cpp +++ b/libs/eigen/doc/examples/tut_arithmetic_dot_cross.cpp @@ -1,15 +1,18 @@ #include #include -using namespace Eigen; -using namespace std; int main() { - Vector3d v(1,2,3); - Vector3d w(0,1,2); + Eigen::Vector3d v(1,2,3); + Eigen::Vector3d w(0,1,2); - cout << "Dot product: " << v.dot(w) << endl; + std::cout << "Dot product: " << v.dot(w) << std::endl; double dp = v.adjoint()*w; // automatic conversion of the inner product to a scalar - cout << "Dot product via a matrix product: " << dp << endl; - cout << "Cross product:\n" << v.cross(w) << endl; + std::cout << "Dot product via a matrix product: " << dp << std::endl; + + std::cout << "Cross product:\n" << v.cross(w) << std::endl; + Eigen::Vector2d v2(1,2); + Eigen::Vector2d w2(0,1); + double cp = v2.cross(w2); // returning a scalar between size-2 vectors + std::cout << "Cross product for 2D vectors: " << cp << std::endl; } diff --git a/libs/eigen/doc/examples/tut_arithmetic_matrix_mul.cpp b/libs/eigen/doc/examples/tut_arithmetic_matrix_mul.cpp index f213902..c2d5e2d 100644 --- a/libs/eigen/doc/examples/tut_arithmetic_matrix_mul.cpp +++ b/libs/eigen/doc/examples/tut_arithmetic_matrix_mul.cpp @@ -1,13 +1,12 @@ #include #include -using namespace Eigen; int main() { - Matrix2d mat; + Eigen::Matrix2d mat; mat << 1, 2, 3, 4; - Vector2d u(-1,1), v(2,0); + Eigen::Vector2d u(-1,1), v(2,0); std::cout << "Here is mat*mat:\n" << mat*mat << std::endl; std::cout << "Here is mat*u:\n" << mat*u << std::endl; std::cout << "Here is u^T*mat:\n" << u.transpose()*mat << std::endl; diff --git a/libs/eigen/doc/examples/tut_arithmetic_scalar_mul_div.cpp b/libs/eigen/doc/examples/tut_arithmetic_scalar_mul_div.cpp index d5f65b5..0ba8d6b 100644 --- a/libs/eigen/doc/examples/tut_arithmetic_scalar_mul_div.cpp +++ b/libs/eigen/doc/examples/tut_arithmetic_scalar_mul_div.cpp @@ -1,14 +1,12 @@ #include #include -using namespace Eigen; - int main() { - Matrix2d a; + Eigen::Matrix2d a; a << 1, 2, 3, 4; - Vector3d v(1,2,3); + Eigen::Vector3d v(1,2,3); std::cout << "a * 2.5 =\n" << a * 2.5 << std::endl; std::cout << "0.1 * v =\n" << 0.1 * v << std::endl; std::cout << "Doing v *= 2;" << std::endl; diff --git a/libs/eigen/doc/examples/tut_matrix_coefficient_accessors.cpp b/libs/eigen/doc/examples/tut_matrix_coefficient_accessors.cpp index c2da171..040087c 100644 --- a/libs/eigen/doc/examples/tut_matrix_coefficient_accessors.cpp +++ 
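The dot/cross example above gains a two-dimensional cross product; with this patch's Eigen it returns a scalar rather than a vector, as this short sketch spells out:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Vector2d v2(1, 2), w2(0, 1);
  // For size-2 vectors, cross() yields the scalar z-component of the 3D
  // cross product of the zero-extended inputs: v2.x()*w2.y() - v2.y()*w2.x().
  double cp = v2.cross(w2);
  std::cout << cp << "\n";  // 1*1 - 2*0 = 1
}
```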
b/libs/eigen/doc/examples/tut_matrix_coefficient_accessors.cpp @@ -1,17 +1,15 @@ #include #include -using namespace Eigen; - int main() { - MatrixXd m(2,2); + Eigen::MatrixXd m(2,2); m(0,0) = 3; m(1,0) = 2.5; m(0,1) = -1; m(1,1) = m(1,0) + m(0,1); std::cout << "Here is the matrix m:\n" << m << std::endl; - VectorXd v(2); + Eigen::VectorXd v(2); v(0) = 4; v(1) = v(0) - 1; std::cout << "Here is the vector v:\n" << v << std::endl; diff --git a/libs/eigen/doc/examples/tut_matrix_resize.cpp b/libs/eigen/doc/examples/tut_matrix_resize.cpp index 0392c3a..aa80cf5 100644 --- a/libs/eigen/doc/examples/tut_matrix_resize.cpp +++ b/libs/eigen/doc/examples/tut_matrix_resize.cpp @@ -1,16 +1,14 @@ #include #include -using namespace Eigen; - int main() { - MatrixXd m(2,5); + Eigen::MatrixXd m(2,5); m.resize(4,3); std::cout << "The matrix m is of size " << m.rows() << "x" << m.cols() << std::endl; std::cout << "It has " << m.size() << " coefficients" << std::endl; - VectorXd v(2); + Eigen::VectorXd v(2); v.resize(5); std::cout << "The vector v is of size " << v.size() << std::endl; std::cout << "As a matrix, v is of size " diff --git a/libs/eigen/doc/examples/tut_matrix_resize_fixed_size.cpp b/libs/eigen/doc/examples/tut_matrix_resize_fixed_size.cpp index dcbdfa7..3df87d2 100644 --- a/libs/eigen/doc/examples/tut_matrix_resize_fixed_size.cpp +++ b/libs/eigen/doc/examples/tut_matrix_resize_fixed_size.cpp @@ -1,11 +1,9 @@ #include #include -using namespace Eigen; - int main() { - Matrix4d m; + Eigen::Matrix4d m; m.resize(4,4); // no operation std::cout << "The matrix m is of size " << m.rows() << "x" << m.cols() << std::endl; diff --git a/libs/eigen/doc/snippets/CMakeLists.txt b/libs/eigen/doc/snippets/CMakeLists.txt index 65f195a..868d669 100644 --- a/libs/eigen/doc/snippets/CMakeLists.txt +++ b/libs/eigen/doc/snippets/CMakeLists.txt @@ -6,31 +6,26 @@ foreach(snippet_src ${snippets_SRCS}) get_filename_component(snippet ${snippet_src} NAME_WE) set(compile_snippet_target compile_${snippet}) set(compile_snippet_src ${compile_snippet_target}.cpp) - if((NOT ${snippet_src} MATCHES "cxx11") OR EIGEN_COMPILER_SUPPORT_CPP11) - file(READ ${snippet_src} snippet_source_code) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/compile_snippet.cpp.in - ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}) - add_executable(${compile_snippet_target} - ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}) - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - endif() - if(${snippet_src} MATCHES "cxx11") - set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-std=c++11") - endif() - if(${snippet_src} MATCHES "deprecated") - set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-DEIGEN_NO_DEPRECATED_WARNING") - endif() - add_custom_command( - TARGET ${compile_snippet_target} - POST_BUILD - COMMAND ${compile_snippet_target} - ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out - ) - add_dependencies(all_snippets ${compile_snippet_target}) - set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src} - PROPERTIES OBJECT_DEPENDS ${snippet_src}) - else() - message("skip snippet ${snippet_src} because compiler does not support C++11") + + file(READ ${snippet_src} snippet_source_code) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/compile_snippet.cpp.in + ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}) + add_executable(${compile_snippet_target} + ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}) + 
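A note on the resize example above: resize() on a dynamic matrix does not preserve coefficients when the size actually changes; conservativeResize() is the companion API when the overlapping values must survive, as in this sketch:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd m(2, 5);
  m.setConstant(1.0);
  // Keeps the overlapping top-left 2x3 block; newly exposed coefficients
  // are left uninitialized, unlike plain resize() which discards all.
  m.conservativeResize(4, 3);
  std::cout << "m is now " << m.rows() << "x" << m.cols() << "\n";
}
```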
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) endif() + + if(${snippet_src} MATCHES "deprecated") + set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-DEIGEN_NO_DEPRECATED_WARNING") + endif() + add_custom_command( + TARGET ${compile_snippet_target} + POST_BUILD + COMMAND ${compile_snippet_target} + ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out + ) + add_dependencies(all_snippets ${compile_snippet_target}) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src} + PROPERTIES OBJECT_DEPENDS ${snippet_src}) endforeach() diff --git a/libs/eigen/doc/snippets/Cwise_array_atan2_array.cpp b/libs/eigen/doc/snippets/Cwise_array_atan2_array.cpp new file mode 100644 index 0000000..ace075a --- /dev/null +++ b/libs/eigen/doc/snippets/Cwise_array_atan2_array.cpp @@ -0,0 +1,4 @@ +Array<double,1,3> x(8,-25,3), + y(1./3.,0.5,-2.); +cout << "atan2([" << x << "], [" << y << "]) = " << x.atan2(y) << endl; // using ArrayBase::atan2 +cout << "atan2([" << x << "], [" << y << "]) = " << atan2(x,y) << endl; // using Eigen::atan2 diff --git a/libs/eigen/doc/snippets/JacobiSVD_basic.cpp b/libs/eigen/doc/snippets/JacobiSVD_basic.cpp index ab24b9b..6c21baf 100644 --- a/libs/eigen/doc/snippets/JacobiSVD_basic.cpp +++ b/libs/eigen/doc/snippets/JacobiSVD_basic.cpp @@ -1,6 +1,6 @@ MatrixXf m = MatrixXf::Random(3,2); cout << "Here is the matrix m:" << endl << m << endl; -JacobiSVD<MatrixXf> svd(m, ComputeThinU | ComputeThinV); +JacobiSVD<MatrixXf, ComputeThinU | ComputeThinV> svd(m); cout << "Its singular values are:" << endl << svd.singularValues() << endl; cout << "Its left singular vectors are the columns of the thin U matrix:" << endl << svd.matrixU() << endl; cout << "Its right singular vectors are the columns of the thin V matrix:" << endl << svd.matrixV() << endl; diff --git a/libs/eigen/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp b/libs/eigen/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp index bbb821e..cc0c50e 100644 --- a/libs/eigen/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp +++ b/libs/eigen/doc/snippets/SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp @@ -3,7 +3,7 @@ MatrixXd A = X + X.transpose(); cout << "Here is a random symmetric matrix, A:" << endl << A << endl; X = MatrixXd::Random(5,5); MatrixXd B = X * X.transpose(); -cout << "and a random postive-definite matrix, B:" << endl << B << endl << endl; +cout << "and a random positive-definite matrix, B:" << endl << B << endl << endl; GeneralizedSelfAdjointEigenSolver<MatrixXd> es(A,B); cout << "The eigenvalues of the pencil (A,B) are:" << endl << es.eigenvalues() << endl; diff --git a/libs/eigen/doc/snippets/Slicing_arrayexpr.cpp b/libs/eigen/doc/snippets/Slicing_arrayexpr.cpp index 2df8180..6d09980 100644 --- a/libs/eigen/doc/snippets/Slicing_arrayexpr.cpp +++ b/libs/eigen/doc/snippets/Slicing_arrayexpr.cpp @@ -1,4 +1,4 @@ ArrayXi ind(5); ind<<4,2,5,5,3; MatrixXi A = MatrixXi::Random(4,6); cout << "Initial matrix A:\n" << A << "\n\n"; -cout << "A(all,ind-1):\n" << A(all,ind-1) << "\n\n"; +cout << "A(all,ind-1):\n" << A(Eigen::placeholders::all,ind-1) << "\n\n"; diff --git a/libs/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp b/libs/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp index 1087131..7a3e6e5 100644 --- a/libs/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp +++ b/libs/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp @@ -1,5 +1,3 @@ -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE MatrixXi A = 
MatrixXi::Random(4,6); cout << "Initial matrix A:\n" << A << "\n\n"; -cout << "A(all,{4,2,5,5,3}):\n" << A(all,{4,2,5,5,3}) << "\n\n"; -#endif +cout << "A(all,{4,2,5,5,3}):\n" << A(Eigen::placeholders::all,{4,2,5,5,3}) << "\n\n"; diff --git a/libs/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp b/libs/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp index 555f662..74f0727 100644 --- a/libs/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp +++ b/libs/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp @@ -1,4 +1,4 @@ std::vector ind{4,2,5,5,3}; MatrixXi A = MatrixXi::Random(4,6); cout << "Initial matrix A:\n" << A << "\n\n"; -cout << "A(all,ind):\n" << A(all,ind) << "\n\n"; +cout << "A(all,ind):\n" << A(Eigen::placeholders::all,ind) << "\n\n"; diff --git a/libs/eigen/doc/special_examples/CMakeLists.txt b/libs/eigen/doc/special_examples/CMakeLists.txt index 5b00e8b..e6407aa 100644 --- a/libs/eigen/doc/special_examples/CMakeLists.txt +++ b/libs/eigen/doc/special_examples/CMakeLists.txt @@ -19,16 +19,13 @@ if(QT4_FOUND) add_dependencies(all_examples Tutorial_sparse_example) endif() -if(EIGEN_COMPILER_SUPPORT_CPP11) - add_executable(random_cpp11 random_cpp11.cpp) - target_link_libraries(random_cpp11 ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - add_dependencies(all_examples random_cpp11) - ei_add_target_property(random_cpp11 COMPILE_FLAGS "-std=c++11") +add_executable(random_cpp11 random_cpp11.cpp) +target_link_libraries(random_cpp11 ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +add_dependencies(all_examples random_cpp11) - add_custom_command( - TARGET random_cpp11 - POST_BUILD - COMMAND random_cpp11 - ARGS >${CMAKE_CURRENT_BINARY_DIR}/random_cpp11.out - ) -endif() +add_custom_command( + TARGET random_cpp11 + POST_BUILD + COMMAND random_cpp11 + ARGS >${CMAKE_CURRENT_BINARY_DIR}/random_cpp11.out +) diff --git a/libs/eigen/doc/special_examples/random_cpp11.cpp b/libs/eigen/doc/special_examples/random_cpp11.cpp index 33744c0..bd73800 100644 --- a/libs/eigen/doc/special_examples/random_cpp11.cpp +++ b/libs/eigen/doc/special_examples/random_cpp11.cpp @@ -2,13 +2,11 @@ #include #include -using namespace Eigen; - int main() { std::default_random_engine generator; std::poisson_distribution distribution(4.1); auto poisson = [&] () {return distribution(generator);}; - RowVectorXi v = RowVectorXi::NullaryExpr(10, poisson ); + Eigen::RowVectorXi v = Eigen::RowVectorXi::NullaryExpr(10, poisson ); std::cout << v << "\n"; } diff --git a/libs/eigen/failtest/CMakeLists.txt b/libs/eigen/failtest/CMakeLists.txt index 256e541..2c5fc33 100644 --- a/libs/eigen/failtest/CMakeLists.txt +++ b/libs/eigen/failtest/CMakeLists.txt @@ -62,9 +62,5 @@ ei_add_failtest("jacobisvd_int") ei_add_failtest("bdcsvd_int") ei_add_failtest("eigensolver_int") ei_add_failtest("eigensolver_cplx") - -if(EIGEN_TEST_CXX11) - ei_add_failtest("initializer_list_1") - ei_add_failtest("initializer_list_2") -endif() - +ei_add_failtest("initializer_list_1") +ei_add_failtest("initializer_list_2") diff --git a/libs/eigen/lapack/CMakeLists.txt b/libs/eigen/lapack/CMakeLists.txt index e48497f..8d6d754 100644 --- a/libs/eigen/lapack/CMakeLists.txt +++ b/libs/eigen/lapack/CMakeLists.txt @@ -1,10 +1,18 @@ - project(EigenLapack CXX) +if(EIGEN_BUILD_LAPACK AND EIGEN_BUILD_BLAS) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) + if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") + if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER_EQUAL 10.0) + # We use an old version of LAPACK with argument type mismatches. 
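With using namespace Eigen gone from the snippets, the slicing placeholder above is spelled out as Eigen::placeholders::all. A self-contained version of the std::vector case:

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <vector>

int main() {
  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4, 6);
  std::vector<int> ind{4, 2, 5, 5, 3};
  // Select all rows and the listed columns (repeats are allowed);
  // Eigen::all is an equivalent shorter spelling in Eigen 3.4+.
  std::cout << A(Eigen::placeholders::all, ind) << "\n";
}
```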
+ # Allow them to compile anyway with newer GNU versions. + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fallow-argument-mismatch") + endif() + endif() set(EIGEN_Fortran_COMPILER_WORKS ON) else() set(EIGEN_Fortran_COMPILER_WORKS OFF) @@ -145,6 +153,7 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) string(REPLACE "." "_" input_name ${input}) set(testName "${target}_${input_name}") if(EXISTS "${TEST_INPUT}") + add_dependencies(buildtests ${target}) add_test(NAME LAPACK-${testName} COMMAND "${CMAKE_COMMAND}" -DTEST=$ @@ -450,3 +459,6 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) endif() +elseif(EIGEN_BUILD_LAPACK AND NOT EIGEN_BUILD_BLAS) + message(FATAL_ERROR "EIGEN_BUILD_LAPACK requires EIGEN_BUILD_BLAS") +endif() #EIGEN_BUILD_LAPACK diff --git a/libs/eigen/lapack/cholesky.cpp b/libs/eigen/lapack/cholesky.inc similarity index 100% rename from libs/eigen/lapack/cholesky.cpp rename to libs/eigen/lapack/cholesky.inc diff --git a/libs/eigen/lapack/complex_double.cpp b/libs/eigen/lapack/complex_double.cpp index c9c5752..492f743 100644 --- a/libs/eigen/lapack/complex_double.cpp +++ b/libs/eigen/lapack/complex_double.cpp @@ -13,6 +13,6 @@ #define REAL_SCALAR_SUFFIX d #define ISCOMPLEX 1 -#include "cholesky.cpp" -#include "lu.cpp" -#include "svd.cpp" +#include "cholesky.inc" +#include "lu.inc" +#include "svd.inc" diff --git a/libs/eigen/lapack/complex_single.cpp b/libs/eigen/lapack/complex_single.cpp index 6d11b26..cdf989e 100644 --- a/libs/eigen/lapack/complex_single.cpp +++ b/libs/eigen/lapack/complex_single.cpp @@ -13,6 +13,6 @@ #define REAL_SCALAR_SUFFIX s #define ISCOMPLEX 1 -#include "cholesky.cpp" -#include "lu.cpp" -#include "svd.cpp" +#include "cholesky.inc" +#include "lu.inc" +#include "svd.inc" diff --git a/libs/eigen/lapack/double.cpp b/libs/eigen/lapack/double.cpp index ea78bb6..afabce3 100644 --- a/libs/eigen/lapack/double.cpp +++ b/libs/eigen/lapack/double.cpp @@ -12,7 +12,7 @@ #define SCALAR_SUFFIX_UP "D" #define ISCOMPLEX 0 -#include "cholesky.cpp" -#include "lu.cpp" -#include "eigenvalues.cpp" -#include "svd.cpp" +#include "cholesky.inc" +#include "lu.inc" +#include "eigenvalues.inc" +#include "svd.inc" diff --git a/libs/eigen/lapack/eigenvalues.cpp b/libs/eigen/lapack/eigenvalues.inc similarity index 100% rename from libs/eigen/lapack/eigenvalues.cpp rename to libs/eigen/lapack/eigenvalues.inc diff --git a/libs/eigen/lapack/lu.cpp b/libs/eigen/lapack/lu.inc similarity index 100% rename from libs/eigen/lapack/lu.cpp rename to libs/eigen/lapack/lu.inc diff --git a/libs/eigen/lapack/single.cpp b/libs/eigen/lapack/single.cpp index c7da3ef..2994436 100644 --- a/libs/eigen/lapack/single.cpp +++ b/libs/eigen/lapack/single.cpp @@ -12,7 +12,7 @@ #define SCALAR_SUFFIX_UP "S" #define ISCOMPLEX 0 -#include "cholesky.cpp" -#include "lu.cpp" -#include "eigenvalues.cpp" -#include "svd.cpp" +#include "cholesky.inc" +#include "lu.inc" +#include "eigenvalues.inc" +#include "svd.inc" diff --git a/libs/eigen/lapack/svd.cpp b/libs/eigen/lapack/svd.inc similarity index 99% rename from libs/eigen/lapack/svd.cpp rename to libs/eigen/lapack/svd.inc index 77b302b..83544cf 100644 --- a/libs/eigen/lapack/svd.cpp +++ b/libs/eigen/lapack/svd.inc @@ -135,4 +135,4 @@ EIGEN_LAPACK_FUNC(gesvd,(char *jobu, char *jobv, int *m, int* n, Scalar* a, int else if(*jobv=='O') matrix(a,diag_size,*n,*lda) = svd.matrixV().adjoint(); } return 0; -} +} \ No newline at end of file diff --git a/libs/eigen/test/AnnoyingScalar.h b/libs/eigen/test/AnnoyingScalar.h index 7ace083..4362de2 100644 --- 
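The cholesky/lu/eigenvalues/svd sources above are renamed to .inc because they are textually included once per scalar configuration rather than compiled as standalone translation units. A condensed single-file illustration of that pattern (twice_s/twice_d are made-up names, not LAPACK routines):

```cpp
#include <iostream>

// "single" configuration: compile the shared body for float.
#define SCALAR float
#define FUNC(name) name##_s
static SCALAR FUNC(twice)(SCALAR x) { return 2 * x; }
#undef FUNC
#undef SCALAR

// "double" configuration: compile the same body again for double.
#define SCALAR double
#define FUNC(name) name##_d
static SCALAR FUNC(twice)(SCALAR x) { return 2 * x; }
#undef FUNC
#undef SCALAR

int main() { std::cout << twice_s(1.5f) << " " << twice_d(1.5) << "\n"; }
```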
a/libs/eigen/test/AnnoyingScalar.h
+++ b/libs/eigen/test/AnnoyingScalar.h
@@ -32,14 +32,12 @@ class AnnoyingScalar
 {
   public:
     AnnoyingScalar()                { init(); *v = 0; }
-    AnnoyingScalar(long double _v)  { init(); *v = _v; }
-    AnnoyingScalar(double _v)       { init(); *v = _v; }
+    AnnoyingScalar(long double _v)  { init(); *v = static_cast<float>(_v); }
+    AnnoyingScalar(double _v)       { init(); *v = static_cast<float>(_v); }
     AnnoyingScalar(float _v)        { init(); *v = _v; }
-    AnnoyingScalar(int _v)          { init(); *v = _v; }
-    AnnoyingScalar(long _v)         { init(); *v = _v; }
-    #if EIGEN_HAS_CXX11
-    AnnoyingScalar(long long _v)    { init(); *v = _v; }
-    #endif
+    AnnoyingScalar(int _v)          { init(); *v = static_cast<float>(_v); }
+    AnnoyingScalar(long _v)         { init(); *v = static_cast<float>(_v); }
+    AnnoyingScalar(long long _v)    { init(); *v = static_cast<float>(_v); }
     AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
     ~AnnoyingScalar() {
       if(v!=&data)
@@ -83,8 +81,8 @@ class AnnoyingScalar
     AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
     AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v = *other.v; return *this; }
-    bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
-    bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
+    bool operator==(const AnnoyingScalar& other) const { return numext::equal_strict(*v, *other.v); }
+    bool operator!=(const AnnoyingScalar& other) const { return numext::not_equal_strict(*v, *other.v); }
     bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
     bool operator< (const AnnoyingScalar& other) const { return *v <  *other.v; }
     bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
diff --git a/libs/eigen/test/CMakeLists.txt b/libs/eigen/test/CMakeLists.txt
index 5136f82..223a9f1 100644
--- a/libs/eigen/test/CMakeLists.txt
+++ b/libs/eigen/test/CMakeLists.txt
@@ -42,45 +42,53 @@ endif()
 
 set(SPARSE_LIBS " ")
 
 find_package(CHOLMOD)
-if(CHOLMOD_FOUND)
+if(CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK)
   add_definitions("-DEIGEN_CHOLMOD_SUPPORT")
   include_directories(${CHOLMOD_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ")
+
+  ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ")
 endif()
 
 find_package(UMFPACK)
-if(UMFPACK_FOUND)
+if(UMFPACK_FOUND AND EIGEN_BUILD_BLAS)
   add_definitions("-DEIGEN_UMFPACK_SUPPORT")
   include_directories(${UMFPACK_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ")
+
+  ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ")
 endif()
 
 find_package(KLU)
-if(KLU_FOUND)
+if(KLU_FOUND AND EIGEN_BUILD_BLAS)
   add_definitions("-DEIGEN_KLU_SUPPORT")
   include_directories(${KLU_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
+
+  ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
 endif()
 
 find_package(SuperLU 4.0)
-if(SuperLU_FOUND)
+if(SuperLU_FOUND AND EIGEN_BUILD_BLAS)
add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") + + ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") endif() @@ -124,7 +132,7 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) add_definitions("-DEIGEN_SPQR_SUPPORT") include_directories(${SPQR_INCLUDES}) set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) @@ -134,6 +142,17 @@ else() ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ") endif() +find_package(Accelerate) +if(Accelerate_FOUND) + add_definitions("-DEIGEN_ACCELERATE_SUPPORT") + include_directories(${Accelerate_INCLUDES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${Accelerate_LIBRARIES}) + set(Accelerate_ALL_LIBS ${Accelerate_LIBRARIES}) + ei_add_property(EIGEN_TESTED_BACKENDS "Accelerate, ") +else() + ei_add_property(EIGEN_MISSING_BACKENDS "Accelerate, ") +endif() + option(EIGEN_TEST_NOQT "Disable Qt support in unit tests" OFF) if(NOT EIGEN_TEST_NOQT) find_package(Qt4) @@ -166,6 +185,7 @@ ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") ei_add_test(vectorization_logic) ei_add_test(basicstuff) +ei_add_test(constexpr) ei_add_test(constructor) ei_add_test(linearstructure) ei_add_test(integer_types) @@ -187,6 +207,7 @@ ei_add_test(product_small) ei_add_test(product_large) ei_add_test(product_extra) ei_add_test(diagonalmatrices) +ei_add_test(skew_symmetric_matrix3) ei_add_test(adjoint) ei_add_test(diagonal) ei_add_test(miscmatrices) @@ -194,6 +215,7 @@ ei_add_test(commainitializer) ei_add_test(smallvectors) ei_add_test(mapped_matrix) ei_add_test(mapstride) +ei_add_test(unaryviewstride) ei_add_test(mapstaticmethods) ei_add_test(array_cwise) ei_add_test(array_for_matrix) @@ -285,10 +307,11 @@ ei_add_test(array_of_string) ei_add_test(num_dimensions) ei_add_test(stl_iterators) ei_add_test(blasutil) -if(EIGEN_TEST_CXX11) - ei_add_test(initializer_list_construction) - ei_add_test(diagonal_matrix_variadic_ctor) -endif() +ei_add_test(random_matrix) +ei_add_test(initializer_list_construction) +ei_add_test(diagonal_matrix_variadic_ctor) +ei_add_test(serializer) +ei_add_test(tuple_test) add_executable(bug1213 bug1213.cpp bug1213_main.cpp) @@ -302,7 +325,7 @@ else() endif() endif() -ei_add_test(fastmath " ${EIGEN_FASTMATH_FLAGS} ") +ei_add_test(fastmath "${EIGEN_FASTMATH_FLAGS}") # # ei_add_test(denseLM) @@ -310,22 +333,6 @@ if(QT4_FOUND) ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}") endif() -if(UMFPACK_FOUND) - ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") -endif() - -if(KLU_FOUND OR SuiteSparse_FOUND) - ei_add_test(klu_support "" "${KLU_ALL_LIBS}") -endif() - -if(SUPERLU_FOUND) - ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") -endif() - -if(CHOLMOD_FOUND) - ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") -endif() - if(PARDISO_FOUND) ei_add_test(pardiso_support "" "${PARDISO_ALL_LIBS}") endif() @@ -334,7 +341,7 @@ if(PASTIX_FOUND AND (SCOTCH_FOUND OR METIS_FOUND)) ei_add_test(pastix_support "" "${PASTIX_ALL_LIBS}") endif() -if(SPQR_FOUND AND CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS 
AND EIGEN_BUILD_LAPACK) ei_add_test(spqr_support "" "${SPQR_ALL_LIBS}") endif() @@ -342,6 +349,10 @@ if(METIS_FOUND) ei_add_test(metis_support "" "${METIS_LIBRARIES}") endif() +if(Accelerate_FOUND) + ei_add_test(accelerate_support "" "${Accelerate_ALL_LIBS}") +endif() + string(TOLOWER "${CMAKE_CXX_COMPILER}" cmake_cxx_compiler_tolower) if(cmake_cxx_compiler_tolower MATCHES "qcc") set(CXX_IS_QCC "ON") @@ -383,43 +394,51 @@ if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang") message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.") endif() -if(EIGEN_TEST_CUDA) +find_package(CUDA 9.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -find_package(CUDA 5.0) -if(CUDA_FOUND) - - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") else() - foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + # Define an -arch=sm_, otherwise if GPU does not exactly match one of + # those in the arch list for -gencode, the kernels will fail to run with + # cudaErrorNoKernelImageForDevice + # This can happen with newer cards (e.g. sm_75) and compiling with older + # versions of nvcc (e.g. 9.2) that do not support their specific arch. + list(LENGTH EIGEN_CUDA_COMPUTE_ARCH EIGEN_CUDA_COMPUTE_ARCH_SIZE) + if(EIGEN_CUDA_COMPUTE_ARCH_SIZE) + list(GET EIGEN_CUDA_COMPUTE_ARCH 0 EIGEN_CUDA_COMPUTE_DEFAULT) + set(NVCC_ARCH_FLAGS " -arch=sm_${EIGEN_CUDA_COMPUTE_DEFAULT}") + endif() + foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + ei_add_test(gpu_example) ei_add_test(gpu_basic) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() -endif() - # HIP unit tests option(EIGEN_TEST_HIP "Add HIP support." 
OFF)
@@ -442,6 +461,7 @@ if (EIGEN_TEST_HIP)
     set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
     ei_add_test(gpu_basic)
+    ei_add_test(gpu_example)
     unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
   elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
diff --git a/libs/eigen/test/OffByOneScalar.h b/libs/eigen/test/OffByOneScalar.h
new file mode 100644
index 0000000..c0371a6
--- /dev/null
+++ b/libs/eigen/test/OffByOneScalar.h
@@ -0,0 +1,29 @@
+
+// A Scalar with internal representation T+1 so that zero is internally
+// represented by T(1). This is used to test memory fill.
+//
+template<typename T>
+class OffByOneScalar {
+ public:
+  OffByOneScalar() : val_(1) {}
+  OffByOneScalar(const OffByOneScalar& other) {
+    *this = other;
+  }
+  OffByOneScalar& operator=(const OffByOneScalar& other) {
+    val_ = other.val_;
+    return *this;
+  }
+
+  OffByOneScalar(T val) : val_(val + 1) {}
+  OffByOneScalar& operator=(T val) {
+    val_ = val + 1;
+    return *this;
+  }
+
+  operator T() const {
+    return val_ - 1;
+  }
+
+ private:
+  T val_;
+};
diff --git a/libs/eigen/test/accelerate_support.cpp b/libs/eigen/test/accelerate_support.cpp
new file mode 100644
index 0000000..ac4be61
--- /dev/null
+++ b/libs/eigen/test/accelerate_support.cpp
@@ -0,0 +1,176 @@
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#if defined(DEBUG)
+#undef DEBUG
+#endif
+
+#include <Eigen/AccelerateSupport>
+
+template<typename MatrixType, typename DenseMat>
+int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300, int maxCols = 300)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  int rows = internal::random<int>(1, maxRows);
+  int cols = internal::random<int>(1, maxCols);
+  double density = (std::max)(8.0 / (rows * cols), 0.01);
+
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+  initSparse<Scalar>(density, dA, A, ForceNonZeroDiag);
+  A.makeCompressed();
+  return rows;
+}
+
+template<typename MatrixType, typename DenseMat>
+int generate_sparse_square_symmetric_problem(MatrixType& A, DenseMat& dA, int maxSize = 300)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  int rows = internal::random<int>(1, maxSize);
+  int cols = rows;
+  double density = (std::max)(8.0 / (rows * cols), 0.01);
+
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+  initSparse<Scalar>(density, dA, A, ForceNonZeroDiag);
+  dA = dA * dA.transpose();
+  A = A * A.transpose();
+  A.makeCompressed();
+  return rows;
+}
+
+template<typename Solver, typename Scalar> void test_accelerate_ldlt()
+{
+  typedef SparseMatrix<Scalar> MatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DenseVector;
+
+  MatrixType A;
+  Matrix<Scalar, Dynamic, Dynamic> dA;
+
+  generate_sparse_square_symmetric_problem(A, dA);
+
+  DenseVector b = DenseVector::Random(A.rows());
+
+  Solver solver;
+  solver.compute(A);
+
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse LDLT factorization failed\n";
+    exit(0);
+    return;
+  }
+
+  DenseVector x = solver.solve(b);
+
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse LDLT factorization failed\n";
+    exit(0);
+    return;
+  }
+
+  //Compare with a dense solver
+  DenseVector refX = dA.ldlt().solve(b);
+  VERIFY((A * x).isApprox(A * refX, test_precision<Scalar>()));
+}
+
+template<typename Solver, typename Scalar> void test_accelerate_llt()
+{
+  typedef SparseMatrix<Scalar> MatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DenseVector;
+
+  MatrixType A;
+  Matrix<Scalar, Dynamic, Dynamic> dA;
+
+  generate_sparse_square_symmetric_problem(A, dA);
+
+  DenseVector b = DenseVector::Random(A.rows());
+
+  Solver solver;
+  solver.compute(A);
+
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse LLT factorization failed\n";
+    exit(0);
+    return;
+  }
+
+  DenseVector x = solver.solve(b);
+
+  if (solver.info() != Success)
+  {
+    std::cerr << "sparse LLT factorization failed\n";
+    exit(0);
+    return;
+  }
+
+  //Compare with a dense solver
+
DenseVector refX = dA.llt().solve(b); + VERIFY((A * x).isApprox(A * refX, test_precision())); +} + +template void test_accelerate_qr() +{ + typedef SparseMatrix MatrixType; + typedef Matrix DenseVector; + + MatrixType A; + Matrix dA; + + generate_sparse_rectangular_problem(A, dA); + + DenseVector b = DenseVector::Random(A.rows()); + + Solver solver; + solver.compute(A); + + if (solver.info() != Success) + { + std::cerr << "sparse QR factorization failed\n"; + exit(0); + return; + } + + DenseVector x = solver.solve(b); + + if (solver.info() != Success) + { + std::cerr << "sparse QR factorization failed\n"; + exit(0); + return; + } + + //Compare with a dense solver + DenseVector refX = dA.colPivHouseholderQr().solve(b); + VERIFY((A * x).isApprox(A * refX, test_precision())); +} + +template +void run_tests() +{ + typedef SparseMatrix MatrixType; + + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + test_accelerate_ldlt >(); + + test_accelerate_llt >(); + + test_accelerate_llt >(); + + test_accelerate_qr >(); +} + +EIGEN_DECLARE_TEST(accelerate_support) +{ + CALL_SUBTEST_1(run_tests()); + CALL_SUBTEST_2(run_tests()); +} diff --git a/libs/eigen/test/adjoint.cpp b/libs/eigen/test/adjoint.cpp index 4c4f98b..37d23b1 100644 --- a/libs/eigen/test/adjoint.cpp +++ b/libs/eigen/test/adjoint.cpp @@ -7,8 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#define EIGEN_NO_STATIC_ASSERT - #include "main.h" template struct adjoint_specific; @@ -47,7 +45,7 @@ template<> struct adjoint_specific { VERIFY_IS_APPROX((v1*0).normalized(), (v1*0)); #if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE) RealScalar very_small = (std::numeric_limits::min)(); - VERIFY( (v1*very_small).norm() == 0 ); + VERIFY( numext::is_exactly_zero((v1*very_small).norm()) ); VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small)); v3 = v1*very_small; v3.normalize(); @@ -64,6 +62,17 @@ template<> struct adjoint_specific { } }; +template +MatrixType RandomMatrix(Index rows, Index cols, Scalar min, Scalar max) { + MatrixType M = MatrixType(rows, cols); + for (Index i=0; i(min, max); + } + } + return M; +} + template void adjoint(const MatrixType& m) { /* this test covers the following files: @@ -79,17 +88,21 @@ template void adjoint(const MatrixType& m) Index rows = m.rows(); Index cols = m.cols(); - MatrixType m1 = MatrixType::Random(rows, cols), - m2 = MatrixType::Random(rows, cols), + // Avoid integer overflow by limiting input values. + RealScalar rmin = static_cast(NumTraits::IsInteger ? NumTraits::IsSigned ? -100 : 0 : -1); + RealScalar rmax = static_cast(NumTraits::IsInteger ? 
100 : 1); + + MatrixType m1 = RandomMatrix(rows, cols, rmin, rmax), + m2 = RandomMatrix(rows, cols, rmin, rmax), m3(rows, cols), - square = SquareMatrixType::Random(rows, rows); - VectorType v1 = VectorType::Random(rows), - v2 = VectorType::Random(rows), - v3 = VectorType::Random(rows), + square = RandomMatrix(rows, rows, rmin, rmax); + VectorType v1 = RandomMatrix(rows, 1, rmin, rmax), + v2 = RandomMatrix(rows, 1, rmin, rmax), + v3 = RandomMatrix(rows, 1, rmin, rmax), vzero = VectorType::Zero(rows); - Scalar s1 = internal::random(), - s2 = internal::random(); + Scalar s1 = internal::random(rmin, rmax), + s2 = internal::random(rmin, rmax); // check basic compatibility of adjoint, transpose, conjugate VERIFY_IS_APPROX(m1.transpose().conjugate().adjoint(), m1); @@ -140,7 +153,8 @@ template void adjoint(const MatrixType& m) // check mixed dot product typedef Matrix RealVectorType; - RealVectorType rv1 = RealVectorType::Random(rows); + RealVectorType rv1 = RandomMatrix(rows, 1, rmin, rmax); + VERIFY_IS_APPROX(v1.dot(rv1.template cast()), v1.dot(rv1)); VERIFY_IS_APPROX(rv1.template cast().dot(v1), rv1.dot(v1)); diff --git a/libs/eigen/test/array_cwise.cpp b/libs/eigen/test/array_cwise.cpp index 0cc438b..af8a1ef 100644 --- a/libs/eigen/test/array_cwise.cpp +++ b/libs/eigen/test/array_cwise.cpp @@ -7,12 +7,22 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include #include "main.h" +template ::IsInteger,int> = 0> +std::vector special_values() { + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar two = Scalar(2); + const Scalar three = Scalar(3); + const Scalar min = (std::numeric_limits::min)(); + const Scalar max = (std::numeric_limits::max)(); + return { zero, min, one, two, three, max }; +} -// Test the corner cases of pow(x, y) for real types. -template -void pow_test() { +template ::IsInteger, int> = 0> +std::vector special_values() { const Scalar zero = Scalar(0); const Scalar eps = Eigen::NumTraits::epsilon(); const Scalar one = Scalar(1); @@ -26,36 +36,29 @@ void pow_test() { const Scalar min = (std::numeric_limits::min)(); const Scalar max = (std::numeric_limits::max)(); const Scalar max_exp = (static_cast(int(Eigen::NumTraits::max_exponent())) * Scalar(EIGEN_LN2)) / eps; + return { zero, denorm_min, min, eps, sqrt_half, one, sqrt2, two, three, max_exp, max, inf, nan }; +} - const static Scalar abs_vals[] = {zero, - denorm_min, - min, - eps, - sqrt_half, - one, - sqrt2, - two, - three, - max_exp, - max, - inf, - nan}; - const int abs_cases = 13; - const int num_cases = 2*abs_cases * 2*abs_cases; - // Repeat the same value to make sure we hit the vectorized path. - const int num_repeats = 32; - Array x(num_repeats, num_cases); - Array y(num_repeats, num_cases); +template +void special_value_pairs(Array& x, + Array& y) { + std::vector abs_vals = special_values(); + const Index abs_cases = (Index)abs_vals.size(); + const Index num_cases = 2*abs_cases * 2*abs_cases; + // ensure both vectorized and non-vectorized paths taken + const Index num_repeats = 2 * (Index)internal::packet_traits::size + 1; + x.resize(num_repeats, num_cases); + y.resize(num_repeats, num_cases); int count = 0; - for (int i = 0; i < abs_cases; ++i) { + for (Index i = 0; i < abs_cases; ++i) { const Scalar abs_x = abs_vals[i]; - for (int sign_x = 0; sign_x < 2; ++sign_x) { + for (Index sign_x = 0; sign_x < 2; ++sign_x) { Scalar x_case = sign_x == 0 ? 
-abs_x : abs_x; - for (int j = 0; j < abs_cases; ++j) { + for (Index j = 0; j < abs_cases; ++j) { const Scalar abs_y = abs_vals[j]; - for (int sign_y = 0; sign_y < 2; ++sign_y) { + for (Index sign_y = 0; sign_y < 2; ++sign_y) { Scalar y_case = sign_y == 0 ? -abs_y : abs_y; - for (int repeat = 0; repeat < num_repeats; ++repeat) { + for (Index repeat = 0; repeat < num_repeats; ++repeat) { x(repeat, count) = x_case; y(repeat, count) = y_case; } @@ -64,24 +67,266 @@ void pow_test() { } } } +} - Array actual = x.pow(y); +template +void binary_op_test(std::string name, Fn fun, RefFn ref) { const Scalar tol = test_precision(); + Array x; + Array y; + special_value_pairs(x, y); + + Array actual = fun(x, y); bool all_pass = true; - for (int i = 0; i < 1; ++i) { - for (int j = 0; j < num_cases; ++j) { - Scalar e = static_cast(std::pow(x(i,j), y(i,j))); + for (Index i = 0; i < x.rows(); ++i) { + for (Index j = 0; j < x.cols(); ++j) { + Scalar e = static_cast(ref(x(i,j), y(i,j))); Scalar a = actual(i, j); - bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e)); - all_pass &= !fail; - if (fail) { - std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; + bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { + std::cout << name << "(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; } } } VERIFY(all_pass); } +template +void binary_ops_test() { + binary_op_test("pow", + [](auto x, auto y) { return Eigen::pow(x, y); }, + [](auto x, auto y) { return std::pow(x, y); }); + binary_op_test("atan2", + [](auto x, auto y) { return Eigen::atan2(x, y); }, + [](auto x, auto y) { return std::atan2(x, y); }); +} + +template +void pow_scalar_exponent_test() { + using Int_t = typename internal::make_integer::type; + const Scalar tol = test_precision(); + + std::vector abs_vals = special_values(); + const Index num_vals = (Index)abs_vals.size(); + Map> bases(abs_vals.data(), num_vals); + + bool all_pass = true; + for (Scalar abs_exponent : abs_vals) { + for (Scalar exponent : {-abs_exponent, abs_exponent}) { + // test integer exponent code path + bool exponent_is_integer = (numext::isfinite)(exponent) && (numext::round(exponent) == exponent) && + (numext::abs(exponent) < static_cast(NumTraits::highest())); + if (exponent_is_integer) { + Int_t exponent_as_int = static_cast(exponent); + Array eigenPow = bases.pow(exponent_as_int); + for (Index j = 0; j < num_vals; j++) { + Scalar e = static_cast(std::pow(bases(j), exponent)); + Scalar a = eigenPow(j); + bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || + ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { + std::cout << "pow(" << bases(j) << "," << exponent << ") = " << a << " != " << e << std::endl; + } + } + } else { + // test floating point exponent code path + Array eigenPow = bases.pow(exponent); + for (Index j = 0; j < num_vals; j++) { + Scalar e = static_cast(std::pow(bases(j), exponent)); + Scalar a = eigenPow(j); + bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || + ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { + std::cout << "pow(" << bases(j) << "," << exponent << ") = " << a << " != " << e << std::endl; + } + } + } + } + } + VERIFY(all_pass); +} + +template +Scalar calc_overflow_threshold(const 
ScalarExponent exponent) { + EIGEN_USING_STD(exp2); + EIGEN_USING_STD(log2); + EIGEN_STATIC_ASSERT((NumTraits::digits() < 2 * NumTraits::digits()), BASE_TYPE_IS_TOO_BIG); + + if (exponent < 2) + return NumTraits::highest(); + else { + // base^e <= highest ==> base <= 2^(log2(highest)/e) + // For floating-point types, consider the bound for integer values that can be reproduced exactly = 2 ^ digits + double highest_bits = numext::mini(static_cast(NumTraits::digits()), + static_cast(log2(NumTraits::highest()))); + return static_cast( + numext::floor(exp2(highest_bits / static_cast(exponent)))); + } +} + +template ::IsInteger> +struct ref_pow { + static Base run(Base base, Exponent exponent) { + EIGEN_USING_STD(pow); + return pow(base, static_cast(exponent)); + } +}; + +template +struct ref_pow { + static Base run(Base base, Exponent exponent) { + EIGEN_USING_STD(pow); + return pow(base, exponent); + } +}; + +template +void test_exponent(Exponent exponent) { + const Base max_abs_bases = static_cast(10000); + // avoid integer overflow in Base type + Base threshold = calc_overflow_threshold(numext::abs(exponent)); + // avoid numbers that can't be verified with std::pow + double double_threshold = calc_overflow_threshold(numext::abs(exponent)); + // use the lesser of these two thresholds + Base testing_threshold = + static_cast(threshold) < double_threshold ? threshold : static_cast(double_threshold); + // test both vectorized and non-vectorized code paths + const Index array_size = 2 * internal::packet_traits::size + 1; + + Base max_base = numext::mini(testing_threshold, max_abs_bases); + Base min_base = NumTraits::IsSigned ? -max_base : Base(0); + + ArrayX x(array_size), y(array_size); + bool all_pass = true; + for (Base base = min_base; base <= max_base; base++) { + if (exponent < 0 && base == 0) continue; + x.setConstant(base); + y = x.pow(exponent); + for (Base a : y) { + Base e = ref_pow::run(base, exponent); + bool pass = (a == e); + if (!NumTraits::IsInteger) { + pass = pass || (((numext::isfinite)(e) && internal::isApprox(a, e)) || + ((numext::isnan)(a) && (numext::isnan)(e))); + } + all_pass &= pass; + if (!pass) { + std::cout << "pow(" << base << "," << exponent << ") = " << a << " != " << e << std::endl; + } + } + } + VERIFY(all_pass); +} + +template +void unary_pow_test() { + Exponent max_exponent = static_cast(NumTraits::digits()); + Exponent min_exponent = static_cast(NumTraits::IsSigned ? -max_exponent : 0); + + for (Exponent exponent = min_exponent; exponent < max_exponent; ++exponent) { + test_exponent(exponent); + } +} + +void mixed_pow_test() { + // The following cases will test promoting a smaller exponent type + // to a wider base type. + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + + // Although in the following cases the exponent cannot be represented exactly + // in the base type, we do not perform a conversion, but implement + // the operation using repeated squaring. + unary_pow_test(); + unary_pow_test(); + + // The following cases will test promoting a wider exponent type + // to a narrower base type. This should compile but generate a + // deprecation warning: + unary_pow_test(); +} + +void int_pow_test() { + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + + // Although in the following cases the exponent cannot be represented exactly + // in the base type, we do not perform a conversion, but implement the + // operation using repeated squaring. 
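// A standalone sketch (not patch content) of the repeated-squaring scheme
// the comment above refers to; Eigen's actual unary pow kernel additionally
// handles negative exponents, overflow thresholds, and vectorized packets.
#include <cstdint>
double pow_by_squaring(double base, uint64_t e) {
  double result = 1.0;
  while (e != 0) {
    if (e & 1) result *= base;  // fold in the lowest set bit of e
    base *= base;               // square once per bit position
    e >>= 1;
  }
  return result;
}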
+ unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); + unary_pow_test(); +} + +namespace Eigen { +namespace internal { +template +struct test_signbit_op { + Scalar constexpr operator()(const Scalar& a) const { return numext::signbit(a); } + template + inline Packet packetOp(const Packet& a) const { + return psignbit(a); + } +}; +template +struct functor_traits> { + enum { Cost = 1, PacketAccess = true }; //todo: define HasSignbit flag +}; +} // namespace internal +} // namespace Eigen + + +template +void signbit_test() { + const size_t size = 100 * internal::packet_traits::size; + ArrayX x(size), y(size); + x.setRandom(); + std::vector special_vals = special_values(); + for (size_t i = 0; i < special_vals.size(); i++) { + x(2 * i + 0) = special_vals[i]; + x(2 * i + 1) = -special_vals[i]; + } + y = x.unaryExpr(internal::test_signbit_op()); + + bool all_pass = true; + for (size_t i = 0; i < size; i++) { + const Scalar ref_val = numext::signbit(x(i)); + bool not_same = internal::predux_any(internal::bitwise_helper::bitwise_xor(ref_val, y(i))); + if (not_same) std::cout << "signbit(" << x(i) << ") != " << y(i) << "\n"; + all_pass = all_pass && !not_same; + } + + VERIFY(all_pass); +} +void signbit_tests() { + signbit_test(); + signbit_test(); + signbit_test(); + signbit_test(); + + signbit_test(); + signbit_test(); + signbit_test(); + signbit_test(); + + signbit_test(); + signbit_test(); + signbit_test(); + signbit_test(); +} + template void array(const ArrayType& m) { typedef typename ArrayType::Scalar Scalar; @@ -92,8 +337,20 @@ template void array(const ArrayType& m) Index rows = m.rows(); Index cols = m.cols(); - ArrayType m1 = ArrayType::Random(rows, cols), - m2 = ArrayType::Random(rows, cols), + ArrayType m1 = ArrayType::Random(rows, cols); + if (NumTraits::IsInteger && NumTraits::IsSigned + && !NumTraits::IsComplex) { + // Here we cap the size of the values in m1 such that pow(3)/cube() + // doesn't overflow and result in undefined behavior. 
Notice that because + // pow(int, int) promotes its inputs and output to double (according to + // the C++ standard), we have to make sure that the result fits in 53 bits + // for int64, + RealScalar max_val = + numext::mini(RealScalar(std::cbrt(NumTraits::highest())), + RealScalar(std::cbrt(1LL << 53)))/2; + m1.array() = (m1.abs().array() <= max_val).select(m1, Scalar(max_val)); + } + ArrayType m2 = ArrayType::Random(rows, cols), m3(rows, cols); ArrayType m4 = m1; // copy constructor VERIFY_IS_APPROX(m1, m4); @@ -119,23 +376,23 @@ template void array(const ArrayType& m) VERIFY_IS_APPROX(m3, m1 - s1); // scalar operators via Maps - m3 = m1; - ArrayType::Map(m1.data(), m1.rows(), m1.cols()) -= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); - VERIFY_IS_APPROX(m1, m3 - m2); + m3 = m1; m4 = m1; + ArrayType::Map(m4.data(), m4.rows(), m4.cols()) -= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); + VERIFY_IS_APPROX(m4, m3 - m2); - m3 = m1; - ArrayType::Map(m1.data(), m1.rows(), m1.cols()) += ArrayType::Map(m2.data(), m2.rows(), m2.cols()); - VERIFY_IS_APPROX(m1, m3 + m2); + m3 = m1; m4 = m1; + ArrayType::Map(m4.data(), m4.rows(), m4.cols()) += ArrayType::Map(m2.data(), m2.rows(), m2.cols()); + VERIFY_IS_APPROX(m4, m3 + m2); - m3 = m1; - ArrayType::Map(m1.data(), m1.rows(), m1.cols()) *= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); - VERIFY_IS_APPROX(m1, m3 * m2); + m3 = m1; m4 = m1; + ArrayType::Map(m4.data(), m4.rows(), m4.cols()) *= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); + VERIFY_IS_APPROX(m4, m3 * m2); - m3 = m1; + m3 = m1; m4 = m1; m2 = ArrayType::Random(rows,cols); m2 = (m2==0).select(1,m2); - ArrayType::Map(m1.data(), m1.rows(), m1.cols()) /= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); - VERIFY_IS_APPROX(m1, m3 / m2); + ArrayType::Map(m4.data(), m4.rows(), m4.cols()) /= ArrayType::Map(m2.data(), m2.rows(), m2.cols()); + VERIFY_IS_APPROX(m4, m3 / m2); // reductions VERIFY_IS_APPROX(m1.abs().colwise().sum().sum(), m1.abs().sum()); @@ -176,7 +433,6 @@ template void array(const ArrayType& m) FixedArrayType f4(f1.data()); VERIFY_IS_APPROX(f4, f1); } - #if EIGEN_HAS_CXX11 { FixedArrayType f1{s1}; VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1)); @@ -188,7 +444,6 @@ template void array(const ArrayType& m) FixedArrayType f4{f1.data()}; VERIFY_IS_APPROX(f4, f1); } - #endif // pow VERIFY_IS_APPROX(m1.pow(2), m1.square()); @@ -214,14 +469,12 @@ template void array(const ArrayType& m) OneDArrayType o2(static_cast(rows)); VERIFY(o2.size()==rows); } - #if EIGEN_HAS_CXX11 { OneDArrayType o1{rows}; VERIFY(o1.size()==rows); OneDArrayType o4{int(rows)}; VERIFY(o4.size()==rows); } - #endif // Check possible conflicts with 2D ctor typedef Array TwoDArrayType; typedef Array ArrayType2; @@ -238,7 +491,6 @@ template void array(const ArrayType& m) ArrayType2 o4(static_cast(rows),static_cast(cols)); VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols)); } - #if EIGEN_HAS_CXX11 { TwoDArrayType o1{rows,cols}; VERIFY(o1.rows()==rows); @@ -252,7 +504,6 @@ template void array(const ArrayType& m) ArrayType2 o4{int(rows),int(cols)}; VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols)); } - #endif } template void comparisons(const ArrayType& m) @@ -360,11 +611,11 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.sinh(), sinh(m1)); VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); -#if EIGEN_HAS_CXX11_MATH + VERIFY_IS_APPROX(m1.atan2(m2), atan2(m1,m2)); + VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1))); VERIFY_IS_APPROX(m1.sinh().asinh(), 
asinh(sinh(m1))); VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1))); -#endif VERIFY_IS_APPROX(m1.logistic(), logistic(m1)); VERIFY_IS_APPROX(m1.arg(), arg(m1)); @@ -421,6 +672,13 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() ); VERIFY_IS_APPROX( m1*m1.sign(),m1.abs()); VERIFY_IS_APPROX(m1.sign() * m1.abs(), m1); + + ArrayType tmp = m1.atan2(m2); + for (Index i = 0; i < tmp.size(); ++i) { + Scalar actual = tmp.array()(i); + Scalar expected = atan2(m1.array()(i), m2.array()(i)); + VERIFY_IS_APPROX(actual, expected); + } VERIFY_IS_APPROX(numext::abs2(numext::real(m1)) + numext::abs2(numext::imag(m1)), numext::abs2(m1)); VERIFY_IS_APPROX(numext::abs2(Eigen::real(m1)) + numext::abs2(Eigen::imag(m1)), numext::abs2(m1)); @@ -448,7 +706,10 @@ template void array_real(const ArrayType& m) // Avoid inf and NaN. m3 = (m1.square()::epsilon()).select(Scalar(1),m3); VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse()); - pow_test(); + + // Test pow and atan2 on special IEEE values. + binary_ops_test(); + pow_scalar_exponent_test(); VERIFY_IS_APPROX(log10(m3), log(m3)/numext::log(Scalar(10))); VERIFY_IS_APPROX(log2(m3), log(m3)/numext::log(Scalar(2))); @@ -457,7 +718,7 @@ template void array_real(const ArrayType& m) const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); - VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); + VERIFY_IS_CWISE_APPROX(s1/m1, s1 * m1.inverse()); // check inplace transpose m3 = m1; @@ -467,6 +728,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m3, m1); } + template void array_complex(const ArrayType& m) { typedef typename ArrayType::Scalar Scalar; @@ -512,7 +774,6 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval())); VERIFY_IS_APPROX(m1.sign(), sign(m1)); - VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2)); VERIFY_IS_APPROX(m1.exp(), exp(m1)); VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp()); @@ -661,6 +922,35 @@ template void array_integer(const ArrayType& m) VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() ); } +template +struct signed_shift_test_impl { + typedef typename ArrayType::Scalar Scalar; + static constexpr size_t Size = sizeof(Scalar); + static constexpr size_t MaxShift = (CHAR_BIT * Size) - 1; + + template + static inline std::enable_if_t<(N > MaxShift), void> run(const ArrayType& ) {} + template + static inline std::enable_if_t<(N <= MaxShift), void> run(const ArrayType& m) { + const Index rows = m.rows(); + const Index cols = m.cols(); + + ArrayType m1 = ArrayType::Random(rows, cols), m2(rows, cols); + + m2 = m1.unaryExpr([](const Scalar& x) { return x >> N; }); + VERIFY((m2 == m1.unaryExpr(internal::scalar_shift_right_op())).all()); + + m2 = m1.unaryExpr([](const Scalar& x) { return x << N; }); + VERIFY((m2 == m1.unaryExpr( internal::scalar_shift_left_op())).all()); + + run(m); + } +}; +template +void signed_shift_test(const ArrayType& m) { + signed_shift_test_impl::run(m); +} + EIGEN_DECLARE_TEST(array_cwise) { for(int i = 0; i < g_repeat; i++) { @@ -673,6 +963,9 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_6( array(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array_integer(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), 
internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_7( signed_shift_test(ArrayXXi(internal::random(1, EIGEN_TEST_MAX_SIZE), internal::random(1, EIGEN_TEST_MAX_SIZE)))); + CALL_SUBTEST_7( signed_shift_test(Array(internal::random(1, EIGEN_TEST_MAX_SIZE), internal::random(1, EIGEN_TEST_MAX_SIZE)))); + } for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( comparisons(Array()) ); @@ -700,6 +993,12 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_4( array_complex(ArrayXXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_6( int_pow_test() ); + CALL_SUBTEST_7( mixed_pow_test() ); + CALL_SUBTEST_8( signbit_tests() ); + } + VERIFY((internal::is_same< internal::global_math_functions_filtering_base::type, int >::value)); VERIFY((internal::is_same< internal::global_math_functions_filtering_base::type, float >::value)); VERIFY((internal::is_same< internal::global_math_functions_filtering_base::type, ArrayBase >::value)); diff --git a/libs/eigen/test/array_for_matrix.cpp b/libs/eigen/test/array_for_matrix.cpp index fb6be35..06e04a2 100644 --- a/libs/eigen/test/array_for_matrix.cpp +++ b/libs/eigen/test/array_for_matrix.cpp @@ -211,6 +211,40 @@ template void cwise_min_max(const MatrixType& m) VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1).array(), (m1.array().max)( maxM1)); VERIFY_IS_APPROX(m1.array(), (m1.array().max)( minM1)); + // Test NaN propagation for min/max. + if (!NumTraits::IsInteger) { + m1(0,0) = NumTraits::quiet_NaN(); + // Elementwise. + VERIFY((numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + + + VERIFY((numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + + // Reductions. 
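// A standalone usage sketch of the two NaN policies these checks exercise
// (assuming Eigen 3.4+, where min/max and minCoeff/maxCoeff accept a
// NaNPropagation template argument): PropagateNaN yields NaN whenever any
// operand is NaN, while PropagateNumbers prefers the non-NaN values.
#include <Eigen/Dense>
#include <limits>
void nan_policy_demo() {
  Eigen::ArrayXd a(2);
  a << std::numeric_limits<double>::quiet_NaN(), 2.0;
  double keep = a.maxCoeff<Eigen::PropagateNaN>();      // NaN
  double drop = a.maxCoeff<Eigen::PropagateNumbers>();  // 2.0
}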
+ VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + if (m1.size() > 1) { + VERIFY(!(numext::isnan)(m1.template maxCoeff())); + VERIFY(!(numext::isnan)(m1.template minCoeff())); + } else { + VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + } + } } template void resize(const MatrixTraits& t) diff --git a/libs/eigen/test/basicstuff.cpp b/libs/eigen/test/basicstuff.cpp index 4ca607c..47dfc04 100644 --- a/libs/eigen/test/basicstuff.cpp +++ b/libs/eigen/test/basicstuff.cpp @@ -7,11 +7,20 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#define EIGEN_NO_STATIC_ASSERT - #include "main.h" #include "random_without_cast_overflow.h" +template +std::enable_if_t<(MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1),void> +check_index(const MatrixType& m) { + VERIFY_RAISES_ASSERT(m[0]); + VERIFY_RAISES_ASSERT((m+m)[0]); +} + +template +std::enable_if_t +check_index(const MatrixType& /*unused*/) {} + template void basicStuff(const MatrixType& m) { typedef typename MatrixType::Scalar Scalar; @@ -60,10 +69,8 @@ template void basicStuff(const MatrixType& m) x = v1(static_cast(r1)); x = v1(static_cast(r1)); x = v1(static_cast(r1)); -#if EIGEN_HAS_CXX11 x = v1(static_cast(r1)); x = v1(static_cast(r1)); -#endif VERIFY_IS_APPROX( v1, v1); VERIFY_IS_NOT_APPROX( v1, 2*v1); @@ -101,8 +108,7 @@ template void basicStuff(const MatrixType& m) if(cols!=1 && rows!=1) { - VERIFY_RAISES_ASSERT(m1[0]); - VERIFY_RAISES_ASSERT((m1+m1)[0]); + check_index(m1); } VERIFY_IS_APPROX(m3 = m1,m1); @@ -223,10 +229,8 @@ struct casting_test_runner { casting_test::run(); casting_test::run(); casting_test::run(); -#if EIGEN_HAS_CXX11 casting_test::run(); casting_test::run(); -#endif casting_test::run(); casting_test::run(); casting_test::run(); @@ -237,7 +241,7 @@ struct casting_test_runner { }; template -struct casting_test_runner::IsComplex)>::type> +struct casting_test_runner::IsComplex)>> { static void run() { // Only a few casts from std::complex are defined. @@ -256,10 +260,8 @@ void casting_all() { casting_test_runner::run(); casting_test_runner::run(); casting_test_runner::run(); -#if EIGEN_HAS_CXX11 casting_test_runner::run(); casting_test_runner::run(); -#endif casting_test_runner::run(); casting_test_runner::run(); casting_test_runner::run(); diff --git a/libs/eigen/test/bdcsvd.cpp b/libs/eigen/test/bdcsvd.cpp index e92a7dc..539494b 100644 --- a/libs/eigen/test/bdcsvd.cpp +++ b/libs/eigen/test/bdcsvd.cpp @@ -10,35 +10,27 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/ +// We explicitly disable deprecated declarations for this set of tests +// because we purposely verify assertions for the deprecated SVD runtime +// option behavior. 
+#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#elif defined(_MSC_VER) +#pragma warning( disable : 4996 ) +#endif + // discard stack allocation as that too bypasses malloc #define EIGEN_STACK_ALLOCATION_LIMIT 0 #define EIGEN_RUNTIME_NO_MALLOC #include "main.h" #include -#include -#include - #define SVD_DEFAULT(M) BDCSVD #define SVD_FOR_MIN_NORM(M) BDCSVD +#define SVD_STATIC_OPTIONS(M, O) BDCSVD #include "svd_common.h" -// Check all variants of JacobiSVD -template -void bdcsvd(const MatrixType& a = MatrixType(), bool pickrandom = true) -{ - MatrixType m; - if(pickrandom) { - m.resizeLike(a); - svd_fill_random(m); - } - else - m = a; - - CALL_SUBTEST(( svd_test_all_computation_options >(m, false) )); -} - template void bdcsvd_method() { @@ -49,70 +41,141 @@ void bdcsvd_method() VERIFY_IS_APPROX(m.bdcSvd().singularValues(), RealVecType::Ones()); VERIFY_RAISES_ASSERT(m.bdcSvd().matrixU()); VERIFY_RAISES_ASSERT(m.bdcSvd().matrixV()); + + // Deprecated behavior. VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m); VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m); VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd(ComputeFullU|ComputeFullV).solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); + + VERIFY_IS_APPROX(m.template bdcSvd().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd().transpose().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd().adjoint().solve(m), m); + + VERIFY_IS_APPROX(m.template bdcSvd().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd().transpose().solve(m), m); + VERIFY_IS_APPROX(m.template bdcSvd().adjoint().solve(m), m); } // compare the Singular values returned with Jacobi and Bdc -template -void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0) -{ - MatrixType m = MatrixType::Random(a.rows(), a.cols()); - BDCSVD bdc_svd(m); +template +void compare_bdc_jacobi(const MatrixType& a = MatrixType(), int algoswap = 16, bool random = true) { + MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a; + + BDCSVD bdc_svd(m.rows(), m.cols()); + bdc_svd.setSwitchSize(algoswap); + bdc_svd.compute(m); + JacobiSVD jacobi_svd(m); VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues()); - if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); - if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); - if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); - if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); +} + +// Verifies total deflation is **not** triggered. +void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16) +{ + MatrixXd m(4, 3); + if (structure_as_m) { + // The first 3 rows are the reduced form of Matrix 1 as shown below, and it + // has nonzero elements in the first column and diagonals only. + m << 1.056293, 0, 0, + -0.336468, 0.907359, 0, + -1.566245, 0, 0.149150, + -0.1, 0, 0; + } else { + // Matrix 1. 
+ m << 0.882336, 18.3914, -26.7921, + -5.58135, 17.1931, -24.0892, + -20.794, 8.68496, -4.83103, + -8.4981, -10.5451, 23.9072; + } + compare_bdc_jacobi(m, algoswap, false); +} + +template +void bdcsvd_all_options(const MatrixType& input = MatrixType()) { + MatrixType m(input.rows(), input.cols()); + svd_fill_random(m); + svd_option_checks(m); +} + +template +void bdcsvd_verify_assert(const MatrixType& input = MatrixType()) { + svd_verify_assert(input); + svd_verify_constructor_options_assert>(input); } EIGEN_DECLARE_TEST(bdcsvd) { - CALL_SUBTEST_3(( svd_verify_assert >(Matrix3f()) )); - CALL_SUBTEST_4(( svd_verify_assert >(Matrix4d()) )); - CALL_SUBTEST_7(( svd_verify_assert >(MatrixXf(10,12)) )); - CALL_SUBTEST_8(( svd_verify_assert >(MatrixXcd(7,5)) )); - - CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd) )); - CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd) )); + CALL_SUBTEST_1((bdcsvd_verify_assert())); + CALL_SUBTEST_1((bdcsvd_verify_assert())); + CALL_SUBTEST_2((bdcsvd_verify_assert>())); + CALL_SUBTEST_2((bdcsvd_verify_assert>())); + CALL_SUBTEST_3((bdcsvd_verify_assert, 6, 9>>())); - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_3(( bdcsvd() )); - CALL_SUBTEST_4(( bdcsvd() )); - CALL_SUBTEST_5(( bdcsvd >() )); + CALL_SUBTEST_4((svd_all_trivial_2x2(bdcsvd_all_options))); + CALL_SUBTEST_5((svd_all_trivial_2x2(bdcsvd_all_options))); + for (int i = 0; i < g_repeat; i++) { int r = internal::random(1, EIGEN_TEST_MAX_SIZE/2), c = internal::random(1, EIGEN_TEST_MAX_SIZE/2); - + TEST_SET_BUT_UNUSED_VARIABLE(r) TEST_SET_BUT_UNUSED_VARIABLE(c) - - CALL_SUBTEST_6(( bdcsvd(Matrix(r,2)) )); - CALL_SUBTEST_7(( bdcsvd(MatrixXf(r,c)) )); - CALL_SUBTEST_7(( compare_bdc_jacobi(MatrixXf(r,c)) )); - CALL_SUBTEST_10(( bdcsvd(MatrixXd(r,c)) )); - CALL_SUBTEST_10(( compare_bdc_jacobi(MatrixXd(r,c)) )); - CALL_SUBTEST_8(( bdcsvd(MatrixXcd(r,c)) )); - CALL_SUBTEST_8(( compare_bdc_jacobi(MatrixXcd(r,c)) )); + CALL_SUBTEST_6((compare_bdc_jacobi(MatrixXf(r, c)))); + CALL_SUBTEST_7((compare_bdc_jacobi(MatrixXd(r, c)))); + CALL_SUBTEST_8((compare_bdc_jacobi(MatrixXcd(r, c)))); // Test on inf/nan matrix - CALL_SUBTEST_7( (svd_inf_nan, MatrixXf>()) ); - CALL_SUBTEST_10( (svd_inf_nan, MatrixXd>()) ); + CALL_SUBTEST_9((svd_inf_nan())); + CALL_SUBTEST_10((svd_inf_nan())); + + // Verify some computations using all combinations of the Options template parameter. 
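// For reference, a minimal sketch of the compile-time Options API these new
// subtests cover (assuming the post-3.4 SVD interface, where computation
// options are template parameters instead of runtime constructor flags):
#include <Eigen/SVD>
Eigen::VectorXd least_squares(const Eigen::MatrixXd& A, const Eigen::VectorXd& b) {
  Eigen::BDCSVD<Eigen::MatrixXd, Eigen::ComputeThinU | Eigen::ComputeThinV> svd(A);
  return svd.solve(b);  // thin U and V suffice for solving
}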
+ CALL_SUBTEST_11((bdcsvd_all_options())); + CALL_SUBTEST_12((bdcsvd_all_options>())); + CALL_SUBTEST_13((bdcsvd_all_options(MatrixXd(20, 17)))); + CALL_SUBTEST_14((bdcsvd_all_options(MatrixXd(17, 20)))); + CALL_SUBTEST_15((bdcsvd_all_options>(Matrix(r, 15)))); + CALL_SUBTEST_16((bdcsvd_all_options>(Matrix(13, c)))); + CALL_SUBTEST_17((bdcsvd_all_options(MatrixXf(r, c)))); + CALL_SUBTEST_18((bdcsvd_all_options(MatrixXcd(r, c)))); + CALL_SUBTEST_19((bdcsvd_all_options(MatrixXd(r, c)))); + CALL_SUBTEST_20((bdcsvd_all_options>(Matrix(20, 27)))); + CALL_SUBTEST_21((bdcsvd_all_options>(Matrix(27, 20)))); + + CALL_SUBTEST_22(( + svd_check_max_size_matrix, ColPivHouseholderQRPreconditioner>( + r, c))); + CALL_SUBTEST_22( + (svd_check_max_size_matrix, HouseholderQRPreconditioner>(r, + c))); + CALL_SUBTEST_22(( + svd_check_max_size_matrix, ColPivHouseholderQRPreconditioner>( + r, c))); + CALL_SUBTEST_22( + (svd_check_max_size_matrix, HouseholderQRPreconditioner>(r, + c))); } // test matrixbase method - CALL_SUBTEST_1(( bdcsvd_method() )); - CALL_SUBTEST_3(( bdcsvd_method() )); + CALL_SUBTEST_23(( bdcsvd_method() )); + CALL_SUBTEST_23(( bdcsvd_method() )); // Test problem size constructors - CALL_SUBTEST_7( BDCSVD(10,10) ); + CALL_SUBTEST_24( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs // Disabled because not supported by BDCSVD // CALL_SUBTEST_9( svd_preallocate() ); - CALL_SUBTEST_2( svd_underoverflow() ); -} + CALL_SUBTEST_25( svd_underoverflow() ); + // Without total deflation issues. + CALL_SUBTEST_26(( compare_bdc_jacobi_instance(true) )); + CALL_SUBTEST_26(( compare_bdc_jacobi_instance(false) )); + + // With total deflation issues before, when it shouldn't be triggered. + CALL_SUBTEST_27(( compare_bdc_jacobi_instance(true, 3) )); + CALL_SUBTEST_27(( compare_bdc_jacobi_instance(false, 3) )); +} diff --git a/libs/eigen/test/bfloat16_float.cpp b/libs/eigen/test/bfloat16_float.cpp index c3de0b1..b2a22ce 100644 --- a/libs/eigen/test/bfloat16_float.cpp +++ b/libs/eigen/test/bfloat16_float.cpp @@ -209,8 +209,8 @@ void test_numtraits() void test_arithmetic() { - VERIFY_IS_EQUAL(static_cast(bfloat16(2) + bfloat16(2)), 4); - VERIFY_IS_EQUAL(static_cast(bfloat16(2) + bfloat16(-2)), 0); + VERIFY_IS_EQUAL(static_cast(bfloat16(2) + bfloat16(2)), 4.f); + VERIFY_IS_EQUAL(static_cast(bfloat16(2) + bfloat16(-2)), 0.f); VERIFY_IS_APPROX(static_cast(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f); VERIFY_IS_EQUAL(static_cast(bfloat16(2.0f) * bfloat16(-5.5f)), -11.0f); VERIFY_IS_APPROX(static_cast(bfloat16(1.0f) / bfloat16(3.0f)), 0.3339f); diff --git a/libs/eigen/test/blasutil.cpp b/libs/eigen/test/blasutil.cpp index 845a498..ee98df4 100644 --- a/libs/eigen/test/blasutil.cpp +++ b/libs/eigen/test/blasutil.cpp @@ -196,12 +196,7 @@ EIGEN_DECLARE_TEST(blasutil) // TODO: Replace this by a call to numext::int64_t as soon as we have a way to // detect the typedef for int64_t on all platforms -#if EIGEN_HAS_CXX11 CALL_SUBTEST_4(run_test()); -#else - CALL_SUBTEST_4(run_test()); -#endif - CALL_SUBTEST_5(run_test()); CALL_SUBTEST_6(run_test()); CALL_SUBTEST_7(run_test >()); diff --git a/libs/eigen/test/block.cpp b/libs/eigen/test/block.cpp index 84124ab..f8583c3 100644 --- a/libs/eigen/test/block.cpp +++ b/libs/eigen/test/block.cpp @@ -7,11 +7,10 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#define EIGEN_NO_STATIC_ASSERT // otherwise we fail at compile time on unused paths #include "main.h" template -typename Eigen::internal::enable_if::IsComplex,typename MatrixType::Scalar>::type +std::enable_if_t::IsComplex,typename MatrixType::Scalar> block_real_only(const MatrixType &m1, Index r1, Index r2, Index c1, Index c2, const Scalar& s1) { // check cwise-Functions: VERIFY_IS_APPROX(m1.row(r1).cwiseMax(s1), m1.cwiseMax(s1).row(r1)); @@ -24,19 +23,33 @@ block_real_only(const MatrixType &m1, Index r1, Index r2, Index c1, Index c2, co } template -typename Eigen::internal::enable_if::IsComplex,typename MatrixType::Scalar>::type +std::enable_if_t::IsComplex,typename MatrixType::Scalar> block_real_only(const MatrixType &, Index, Index, Index, Index, const Scalar&) { return Scalar(0); } // Check at compile-time that T1==T2, and at runtime-time that a==b template -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> is_same_block(const T1& a, const T2& b) { return a.isApprox(b); } +template +std::enable_if_t<((MatrixType::Flags&RowMajorBit)==0),void> +check_left_top(const MatrixType& m, Index r, Index c, + Index rows, Index /*unused*/) { + VERIFY_IS_EQUAL(m.leftCols(c).coeff(r+c*rows), m(r,c)); +} + +template +std::enable_if_t<((MatrixType::Flags&RowMajorBit)!=0),void> +check_left_top(const MatrixType& m, Index r, Index c, + Index /*unused*/, Index cols) { + VERIFY_IS_EQUAL(m.topRows(r).coeff(c+r*cols), m(r,c)); +} + template void block(const MatrixType& m) { typedef typename MatrixType::Scalar Scalar; @@ -79,7 +92,8 @@ template void block(const MatrixType& m) VERIFY_IS_APPROX(m1.col(c1), m1_copy.col(c1) + s1 * m1_copy.col(c2)); m1.col(c1).col(0) += s1 * m1_copy.col(c2); VERIFY_IS_APPROX(m1.col(c1), m1_copy.col(c1) + Scalar(2) * s1 * m1_copy.col(c2)); - + + check_left_top(m1,r1,c1,rows,cols); //check block() Matrix b1(1,1); b1(0,0) = m1(r1,c1); @@ -135,19 +149,14 @@ template void block(const MatrixType& m) } // stress some basic stuffs with block matrices - VERIFY(numext::real(ones.col(c1).sum()) == RealScalar(rows)); - VERIFY(numext::real(ones.row(r1).sum()) == RealScalar(cols)); + VERIFY_IS_EQUAL(numext::real(ones.col(c1).sum()), RealScalar(rows)); + VERIFY_IS_EQUAL(numext::real(ones.row(r1).sum()), RealScalar(cols)); - VERIFY(numext::real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows)); - VERIFY(numext::real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols)); + VERIFY_IS_EQUAL(numext::real(ones.col(c1).dot(ones.col(c2))), RealScalar(rows)); + VERIFY_IS_EQUAL(numext::real(ones.row(r1).dot(ones.row(r2))), RealScalar(cols)); // check that linear acccessors works on blocks m1 = m1_copy; - if((MatrixType::Flags&RowMajorBit)==0) - VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1)); - else - VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1)); - // now test some block-inside-of-block. 
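// A standalone sketch of the stride arithmetic that the reworked
// compare_using_data_and_stride helpers below verify: for a dense object
// with direct access, rowStride() and colStride() give the distance, in
// scalars, between adjacent rows and columns in memory.
#include <Eigen/Dense>
double coeff_via_strides(const Eigen::MatrixXd& m, Eigen::Index r, Eigen::Index c) {
  return m.data()[r * m.rowStride() + c * m.colStride()];
}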
@@ -213,14 +222,6 @@ template void block(const MatrixType& m) VERIFY_IS_EQUAL( ((m1*1).template block(1,0,0,1)), m1.block(1,0,0,1)); VERIFY_IS_EQUAL( ((m1*1).template block<1,Dynamic>(0,1,1,0)), m1.block(0,1,1,0)); - if (rows>=2 && cols>=2) - { - VERIFY_RAISES_ASSERT( m1 += m1.col(0) ); - VERIFY_RAISES_ASSERT( m1 -= m1.col(0) ); - VERIFY_RAISES_ASSERT( m1.array() *= m1.col(0).array() ); - VERIFY_RAISES_ASSERT( m1.array() /= m1.col(0).array() ); - } - VERIFY_IS_EQUAL( m1.template subVector(r1), m1.row(r1) ); VERIFY_IS_APPROX( (m1+m1).template subVector(r1), (m1+m1).row(r1) ); VERIFY_IS_EQUAL( m1.template subVector(c1), m1.col(c1) ); @@ -240,13 +241,35 @@ template void block(const MatrixType& m) } + template -void compare_using_data_and_stride(const MatrixType& m) +std::enable_if_t +compare_using_data_and_stride(const MatrixType& m) { Index rows = m.rows(); Index cols = m.cols(); Index size = m.size(); Index innerStride = m.innerStride(); + Index rowStride = m.rowStride(); + Index colStride = m.colStride(); + const typename MatrixType::Scalar* data = m.data(); + + for(int j=0;j +std::enable_if_t +compare_using_data_and_stride(const MatrixType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); + Index innerStride = m.innerStride(); Index outerStride = m.outerStride(); Index rowStride = m.rowStride(); Index colStride = m.colStride(); @@ -256,21 +279,11 @@ void compare_using_data_and_stride(const MatrixType& m) for(int i=0;i diff --git a/libs/eigen/test/boostmultiprec.cpp b/libs/eigen/test/boostmultiprec.cpp index 7c79ded..e2fc9a8 100644 --- a/libs/eigen/test/boostmultiprec.cpp +++ b/libs/eigen/test/boostmultiprec.cpp @@ -74,8 +74,7 @@ #include #include -namespace mp = boost::multiprecision; -typedef mp::number, mp::et_on> Real; +typedef boost::multiprecision::number, boost::multiprecision::et_on> Real; namespace Eigen { template<> struct NumTraits : GenericNumTraits { @@ -201,8 +200,8 @@ EIGEN_DECLARE_TEST(boostmultiprec) TEST_SET_BUT_UNUSED_VARIABLE(s) } - CALL_SUBTEST_9(( jacobisvd(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); - CALL_SUBTEST_10(( bdcsvd(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); + CALL_SUBTEST_9(( jacobisvd_all_options(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); + CALL_SUBTEST_10(( bdcsvd_all_options(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); CALL_SUBTEST_11(( test_simplicial_cholesky_T() )); } diff --git a/libs/eigen/test/constexpr.cpp b/libs/eigen/test/constexpr.cpp new file mode 100644 index 0000000..b8f0b09 --- /dev/null +++ b/libs/eigen/test/constexpr.cpp @@ -0,0 +1,52 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2022 Alex Richardson +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
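// For context, a minimal standalone illustration (C++20 only, matching the
// EIGEN_COMP_CXXVER guard used below) of the constexpr construction this
// new test verifies:
#include <Eigen/Dense>
constexpr Eigen::Matrix3i kMat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
static_assert(kMat.coeff(1, 2) == 6, "fixed-size matrices are usable in constant expressions");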
+ +#include "main.h" + +EIGEN_DECLARE_TEST(constexpr) { + // Clang accepts (some of) this code when using C++14/C++17, but GCC does not like + // the fact that `T array[Size]` inside Eigen::internal::plain_array is not initialized + // until after the constructor returns: + // error: member ‘Eigen::internal::plain_array::array’ must be initialized by mem-initializer in + // ‘constexpr’ constructor +#if EIGEN_COMP_CXXVER >= 20 + constexpr Matrix3i mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + VERIFY_IS_EQUAL(mat.size(), 9); + VERIFY_IS_EQUAL(mat(0, 0), 1); + static_assert(mat.coeff(0,1) == 2); + constexpr Array33i arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + VERIFY_IS_EQUAL(arr(0, 0), 1); + VERIFY_IS_EQUAL(arr.size(), 9); + static_assert(arr.coeff(0,1) == 2); + // Also check dynamic size arrays/matrices with fixed-size storage (currently + // only works if all elements are initialized, since otherwise the compiler + // complains about uninitialized trailing elements. + constexpr Matrix dyn_mat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + VERIFY_IS_EQUAL(dyn_mat.size(), 9); + VERIFY_IS_EQUAL(dyn_mat(0, 0), 1); + static_assert(dyn_mat.coeff(0,1) == 2); + constexpr Array dyn_arr({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + VERIFY_IS_EQUAL(dyn_arr(0, 0), 1); + VERIFY_IS_EQUAL(dyn_arr.size(), 9); + static_assert(dyn_arr.coeff(0,1) == 2); +#endif // EIGEN_COMP_CXXVER >= 20 +} + +// Check that we can use the std::initializer_list constructor for constexpr variables. +#if EIGEN_COMP_CXXVER >= 20 +// EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT() will fail constexpr evaluation unless +// we have std::is_constant_evaluated(). +constexpr Matrix global_mat({{1, 2}, {3, 4}}); + +EIGEN_DECLARE_TEST(constexpr_global) { + VERIFY_IS_EQUAL(global_mat.size(), 4); + VERIFY_IS_EQUAL(global_mat(0, 0), 1); + static_assert(global_mat.coeff(0,0) == 1); +} +#endif // EIGEN_COMP_CXXVER >= 20 diff --git a/libs/eigen/test/dense_storage.cpp b/libs/eigen/test/dense_storage.cpp index 45c2bd7..ec78f01 100644 --- a/libs/eigen/test/dense_storage.cpp +++ b/libs/eigen/test/dense_storage.cpp @@ -13,7 +13,6 @@ #include -#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11 using DenseStorageD3x3 = Eigen::DenseStorage; static_assert(std::is_trivially_move_constructible::value, "DenseStorage not trivially_move_constructible"); static_assert(std::is_trivially_move_assignable::value, "DenseStorage not trivially_move_assignable"); @@ -22,7 +21,6 @@ static_assert(std::is_trivially_copy_constructible::value, "De static_assert(std::is_trivially_copy_assignable::value, "DenseStorage not trivially_copy_assignable"); static_assert(std::is_trivially_copyable::value, "DenseStorage not trivially_copyable"); #endif -#endif template void dense_storage_copy(int rows, int cols) @@ -90,8 +88,6 @@ void dense_storage_swap(int rows0, int cols0, int rows1, int cols1) template void dense_storage_alignment() { - #if EIGEN_HAS_ALIGNAS - struct alignas(Alignment) Empty1 {}; VERIFY_IS_EQUAL(std::alignment_of::value, Alignment); @@ -104,13 +100,12 @@ void dense_storage_alignment() VERIFY_IS_EQUAL( (std::alignment_of >::value), Alignment); const std::size_t default_alignment = internal::compute_default_alignment::value; - - VERIFY_IS_EQUAL( (std::alignment_of >::value), default_alignment); - VERIFY_IS_EQUAL( (std::alignment_of >::value), default_alignment); - struct Nested2 { Matrix mat; }; - VERIFY_IS_EQUAL(std::alignment_of::value, default_alignment); - - #endif + if (default_alignment > 0) { + VERIFY_IS_EQUAL( (std::alignment_of >::value), default_alignment); + VERIFY_IS_EQUAL( 
(std::alignment_of >::value), default_alignment); + struct Nested2 { Matrix mat; }; + VERIFY_IS_EQUAL(std::alignment_of::value, default_alignment); + } } template diff --git a/libs/eigen/test/diagonal_matrix_variadic_ctor.cpp b/libs/eigen/test/diagonal_matrix_variadic_ctor.cpp index fbc8f84..db56539 100644 --- a/libs/eigen/test/diagonal_matrix_variadic_ctor.cpp +++ b/libs/eigen/test/diagonal_matrix_variadic_ctor.cpp @@ -7,32 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#define EIGEN_NO_STATIC_ASSERT - #include "main.h" -template -void assertionTest() -{ - typedef DiagonalMatrix DiagMatrix5; - typedef DiagonalMatrix DiagMatrix7; - typedef DiagonalMatrix DiagMatrixX; - - Scalar raw[6]; - for (int i = 0; i < 6; ++i) { - raw[i] = internal::random(); - } - - VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[2], raw[3]})); - VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[3]})); - VERIFY_RAISES_ASSERT((DiagMatrix7{raw[0], raw[1], raw[2], raw[3]})); - - VERIFY_RAISES_ASSERT((DiagMatrixX { - {raw[0], raw[1], raw[2]}, - {raw[3], raw[4], raw[5]} - })); -} - #define VERIFY_IMPLICIT_CONVERSION_3(DIAGTYPE, V0, V1, V2) \ DIAGTYPE d(V0, V1, V2); \ DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix(); \ @@ -167,14 +143,6 @@ void constructorTest() EIGEN_DECLARE_TEST(diagonal_matrix_variadic_ctor) { - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest()); - CALL_SUBTEST_1(assertionTest>()); - CALL_SUBTEST_2(constructorTest()); CALL_SUBTEST_2(constructorTest()); CALL_SUBTEST_2(constructorTest()); diff --git a/libs/eigen/test/diagonalmatrices.cpp b/libs/eigen/test/diagonalmatrices.cpp index 276bead..15492a7 100644 --- a/libs/eigen/test/diagonalmatrices.cpp +++ b/libs/eigen/test/diagonalmatrices.cpp @@ -7,6 +7,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
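+// (Illustrative sketch, not part of the original patch.) The two defines
+// below turn this file into an allocation test: defining
+// EIGEN_RUNTIME_NO_MALLOC before including Eigen exposes
+// internal::set_is_malloc_allowed(bool), and every internal heap allocation
+// asserts while it is set to false. The basic pattern, assuming a size n:
+// \code
+//   Eigen::VectorXd d = Eigen::VectorXd::Random(n);
+//   Eigen::MatrixXd m = Eigen::MatrixXd::Random(n, n), res(n, n);
+//   Eigen::internal::set_is_malloc_allowed(false);
+//   res.noalias() = d.asDiagonal() * m;  // asserts if the product allocates
+//   Eigen::internal::set_is_malloc_allowed(true);
+// \endcode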
+ +// discard stack allocation as that too bypasses malloc +#define EIGEN_STACK_ALLOCATION_LIMIT 0 +// heap allocation will raise an assert if enabled at runtime +#define EIGEN_RUNTIME_NO_MALLOC + #include "main.h" using namespace std; template void diagonalmatrices(const MatrixType& m) @@ -56,6 +62,7 @@ template void diagonalmatrices(const MatrixType& m) Index i = internal::random(0, rows-1); Index j = internal::random(0, cols-1); + internal::set_is_malloc_allowed(false); VERIFY_IS_APPROX( ((ldm1 * m1)(i,j)) , ldm1.diagonal()(i) * m1(i,j) ); VERIFY_IS_APPROX( ((ldm1 * (m1+m2))(i,j)) , ldm1.diagonal()(i) * (m1+m2)(i,j) ); VERIFY_IS_APPROX( ((m1 * rdm1)(i,j)) , rdm1.diagonal()(j) * m1(i,j) ); @@ -65,6 +72,10 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j)) , (v1+v2)(i) * (m1+m2)(i,j) ); VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * m1(i,j) ); VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * (m1+m2)(i,j) ); + VERIFY_IS_APPROX( (ldm1 * ldm1).diagonal()(i), ldm1.diagonal()(i) * ldm1.diagonal()(i) ); + VERIFY_IS_APPROX( (ldm1 * ldm1 * m1)(i, j), ldm1.diagonal()(i) * ldm1.diagonal()(i) * m1(i, j) ); + VERIFY_IS_APPROX( ((v1.asDiagonal() * v1.asDiagonal()).diagonal()(i)), v1(i) * v1(i) ); + internal::set_is_malloc_allowed(true); if(rows>1) { @@ -84,7 +95,15 @@ template void diagonalmatrices(const MatrixType& m) big.block(i,j,rows,cols) = m1; big.block(i,j,rows,cols) = big.block(i,j,rows,cols) * rv1.asDiagonal(); VERIFY_IS_APPROX((big.block(i,j,rows,cols)) , m1 * rv1.asDiagonal() ); - + + // products do not allocate memory + MatrixType res(rows, cols); + internal::set_is_malloc_allowed(false); + res.noalias() = ldm1 * m1; + res.noalias() = m1 * rdm1; + res.noalias() = ldm1 * m1 * rdm1; + res.noalias() = LeftDiagonalMatrix::Identity(rows) * m1 * RightDiagonalMatrix::Zero(cols); + internal::set_is_malloc_allowed(true); // scalar multiple VERIFY_IS_APPROX(LeftDiagonalMatrix(ldm1*s1).diagonal(), ldm1.diagonal() * s1); @@ -112,6 +131,13 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() + v2.asDiagonal(), sq_m1 + sq_m2); VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - v2.asDiagonal(), sq_m1 - sq_m2); VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - 2*v2.asDiagonal() + v1.asDiagonal(), sq_m1 - 2*sq_m2 + sq_m1); + + // Zero and Identity + LeftDiagonalMatrix zero = LeftDiagonalMatrix::Zero(rows); + LeftDiagonalMatrix identity = LeftDiagonalMatrix::Identity(rows); + VERIFY_IS_APPROX(identity.diagonal().sum(), Scalar(rows)); + VERIFY_IS_APPROX(zero.diagonal().sum(), Scalar(0)); + VERIFY_IS_APPROX((zero + 2 * LeftDiagonalMatrix::Identity(rows)).diagonal().sum(), Scalar(2 * rows)); } template void as_scalar_product(const MatrixType& m) diff --git a/libs/eigen/test/dynalloc.cpp b/libs/eigen/test/dynalloc.cpp index 23c90a7..cdc10ee 100644 --- a/libs/eigen/test/dynalloc.cpp +++ b/libs/eigen/test/dynalloc.cpp @@ -20,9 +20,12 @@ typedef Matrix Vector8f; void check_handmade_aligned_malloc() { + // Hand-made alignment needs at least sizeof(void*) to store the offset. 
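+  // (Why at least sizeof(void*), an explanatory note not in the original
+  // patch: handmade_aligned_malloc over-allocates and stores the pointer
+  // returned by the underlying malloc immediately before the aligned address
+  // it hands out, so that handmade_aligned_free can recover and release the
+  // original allocation.)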
+ constexpr int alignment = (std::max)(EIGEN_DEFAULT_ALIGN_BYTES, sizeof(void*)); + for(int i = 1; i < 1000; i++) { - char *p = (char*)internal::handmade_aligned_malloc(i); + char *p = (char*)internal::handmade_aligned_malloc(i, alignment); VERIFY(internal::UIntPtr(p)%ALIGNMENT==0); // if the buffer is wrongly allocated this will give a bad write --> check with valgrind for(int j = 0; j < i; j++) p[j]=0; diff --git a/libs/eigen/test/eigensolver_generalized_real.cpp b/libs/eigen/test/eigensolver_generalized_real.cpp index 95ed431..a0c99b1 100644 --- a/libs/eigen/test/eigensolver_generalized_real.cpp +++ b/libs/eigen/test/eigensolver_generalized_real.cpp @@ -85,6 +85,42 @@ template void generalized_eigensolver_real(const MatrixType } } +template +void generalized_eigensolver_assert() { + GeneralizedEigenSolver eig; + // all raise assert if uninitialized + VERIFY_RAISES_ASSERT(eig.info()); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); + + // none raise assert after compute called + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == Success); + eig.eigenvectors(); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // eigenvectors() raises assert, if eigenvectors were not requested + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20), false); + VERIFY(eig.info() == Success); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // all except info raise assert if realQZ did not converge + eig.setMaxIterations(0); // force real QZ to fail. + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == NoConvergence); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); +} + EIGEN_DECLARE_TEST(eigensolver_generalized_real) { for(int i = 0; i < g_repeat; i++) { @@ -98,6 +134,7 @@ EIGEN_DECLARE_TEST(eigensolver_generalized_real) CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) ); CALL_SUBTEST_3( generalized_eigensolver_real(Matrix()) ); CALL_SUBTEST_4( generalized_eigensolver_real(Matrix2d()) ); + CALL_SUBTEST_5( generalized_eigensolver_assert() ); TEST_SET_BUT_UNUSED_VARIABLE(s) } } diff --git a/libs/eigen/test/evaluators.cpp b/libs/eigen/test/evaluators.cpp index 2810cd2..95bfb45 100644 --- a/libs/eigen/test/evaluators.cpp +++ b/libs/eigen/test/evaluators.cpp @@ -510,7 +510,9 @@ EIGEN_DECLARE_TEST(evaluators) const size_t K = 2; const size_t N = 5; float *destMem = new float[(M*N) + 1]; - float *dest = (internal::UIntPtr(destMem)%EIGEN_MAX_ALIGN_BYTES) == 0 ? destMem+1 : destMem; + // In case of no alignment, avoid division by zero. + constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1); + float *dest = (internal::UIntPtr(destMem)%alignment) == 0 ? 
destMem+1 : destMem; const Matrix a = Matrix::Random(M, K); const Matrix b = Matrix::Random(K, N); diff --git a/libs/eigen/test/geo_alignedbox.cpp b/libs/eigen/test/geo_alignedbox.cpp index 7b1684f..e4dab32 100644 --- a/libs/eigen/test/geo_alignedbox.cpp +++ b/libs/eigen/test/geo_alignedbox.cpp @@ -211,7 +211,7 @@ MatrixType randomRotationMatrix() // https://www.isprs-ann-photogramm-remote-sens-spatial-inf-sci.net/III-7/103/2016/isprs-annals-III-7-103-2016.pdf const MatrixType rand = MatrixType::Random(); const MatrixType q = rand.householderQr().householderQ(); - const JacobiSVD svd = q.jacobiSvd(ComputeFullU | ComputeFullV); + const JacobiSVD svd(q); const typename MatrixType::Scalar det = (svd.matrixU() * svd.matrixV().transpose()).determinant(); MatrixType diag = rand.Identity(); diag(MatrixType::RowsAtCompileTime - 1, MatrixType::ColsAtCompileTime - 1) = det; diff --git a/libs/eigen/test/geo_eulerangles.cpp b/libs/eigen/test/geo_eulerangles.cpp index 693c627..bea2419 100644 --- a/libs/eigen/test/geo_eulerangles.cpp +++ b/libs/eigen/test/geo_eulerangles.cpp @@ -26,7 +26,7 @@ void verify_euler(const Matrix& ea, int i, int j, int k) VERIFY_IS_APPROX(m, mbis); /* If I==K, and ea[1]==0, then there no unique solution. */ /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ - if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision())) ) + if((i!=k || !numext::is_exactly_zero(ea[1])) && (i == k || !internal::isApprox(abs(ea[1]), Scalar(EIGEN_PI / 2), test_precision())) ) VERIFY((ea-eabis).norm() <= test_precision()); // approx_or_less_than does not work for 0 diff --git a/libs/eigen/test/geo_orthomethods.cpp b/libs/eigen/test/geo_orthomethods.cpp index b7b6607..64b3927 100644 --- a/libs/eigen/test/geo_orthomethods.cpp +++ b/libs/eigen/test/geo_orthomethods.cpp @@ -73,8 +73,39 @@ template void orthomethods_3() // check mixed product typedef Matrix RealVector3; RealVector3 rv1 = RealVector3::Random(); - VERIFY_IS_APPROX(v1.cross(rv1.template cast()), v1.cross(rv1)); - VERIFY_IS_APPROX(rv1.template cast().cross(v1), rv1.cross(v1)); + v2 = rv1.template cast(); + VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1)); + VERIFY_IS_APPROX(v2.cross(v1), rv1.cross(v1)); +} + +template void orthomethods_2() +{ + typedef typename NumTraits::Real RealScalar; + typedef Matrix Vector2; + typedef Matrix Vector3; + + Vector3 v30 = Vector3::Random(), + v31 = Vector3::Random(); + Vector2 v20 = v30.template head<2>(); + Vector2 v21 = v31.template head<2>(); + + VERIFY_IS_MUCH_SMALLER_THAN(v20.cross(v20), Scalar(1)); + VERIFY_IS_MUCH_SMALLER_THAN(v21.cross(v21), Scalar(1)); + VERIFY_IS_APPROX(v20.cross(v21), v30.cross(v31).z()); + + Vector2 v20Rot90(numext::conj(-v20.y()), numext::conj(v20.x())); + VERIFY_IS_APPROX(v20.cross( v20Rot90), v20.squaredNorm()); + VERIFY_IS_APPROX(v20.cross(-v20Rot90), -v20.squaredNorm()); + Vector2 v21Rot90(numext::conj(-v21.y()), numext::conj(v21.x())); + VERIFY_IS_APPROX(v21.cross( v21Rot90), v21.squaredNorm()); + VERIFY_IS_APPROX(v21.cross(-v21Rot90), -v21.squaredNorm()); + + // check mixed product + typedef Matrix RealVector2; + RealVector2 rv21 = RealVector2::Random(); + v21 = rv21.template cast(); + VERIFY_IS_APPROX(v20.cross(v21), v20.cross(rv21)); + VERIFY_IS_APPROX(v21.cross(v20), rv21.cross(v20)); } template void orthomethods(int size=Size) @@ -118,6 +149,9 @@ template void orthomethods(int size=Size) EIGEN_DECLARE_TEST(geo_orthomethods) { for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( orthomethods_2() ); + 
CALL_SUBTEST_2( orthomethods_2() ); + CALL_SUBTEST_4( orthomethods_2 >() ); CALL_SUBTEST_1( orthomethods_3() ); CALL_SUBTEST_2( orthomethods_3() ); CALL_SUBTEST_4( orthomethods_3 >() ); diff --git a/libs/eigen/test/geo_quaternion.cpp b/libs/eigen/test/geo_quaternion.cpp index c561fc8..a821cf2 100644 --- a/libs/eigen/test/geo_quaternion.cpp +++ b/libs/eigen/test/geo_quaternion.cpp @@ -286,15 +286,13 @@ template void check_const_correctness(const PlainObjec // CMake can help with that. // verify that map-to-const don't have LvalueBit - typedef typename internal::add_const::type ConstPlainObjectType; + typedef std::add_const_t ConstPlainObjectType; VERIFY( !(internal::traits >::Flags & LvalueBit) ); VERIFY( !(internal::traits >::Flags & LvalueBit) ); VERIFY( !(Map::Flags & LvalueBit) ); VERIFY( !(Map::Flags & LvalueBit) ); } -#if EIGEN_HAS_RVALUE_REFERENCES - // Regression for bug 1573 struct MovableClass { // The following line is a workaround for gcc 4.7 and 4.8 (see bug 1573 comments). @@ -307,8 +305,6 @@ struct MovableClass { Quaternionf m_quat; }; -#endif - EIGEN_DECLARE_TEST(geo_quaternion) { for(int i = 0; i < g_repeat; i++) { diff --git a/libs/eigen/test/gpu_basic.cu b/libs/eigen/test/gpu_basic.cu index 4298da3..e424a93 100644 --- a/libs/eigen/test/gpu_basic.cu +++ b/libs/eigen/test/gpu_basic.cu @@ -138,10 +138,12 @@ struct complex_operators { out[out_idx++] = a / numext::real(b); out[out_idx++] = numext::real(a) / b; +#if !defined(EIGEN_COMP_MSVC) out[out_idx] = a; out[out_idx++] += b; out[out_idx] = a; out[out_idx++] -= b; out[out_idx] = a; out[out_idx++] *= b; out[out_idx] = a; out[out_idx++] /= b; +#endif const ComplexType true_value = ComplexType(ValueType(1), ValueType(0)); const ComplexType false_value = ComplexType(ValueType(0), ValueType(0)); @@ -188,6 +190,7 @@ struct complex_operators { res.segment(block_idx, size) = x1.real().array() / x2.array(); block_idx += size; +#if !defined(EIGEN_COMP_MSVC) res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2; block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2; @@ -196,6 +199,7 @@ struct complex_operators { block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); block_idx += size; +#endif const T true_vector = T::Constant(true_value); const T false_vector = T::Constant(false_value); diff --git a/libs/eigen/test/gpu_example.cu b/libs/eigen/test/gpu_example.cu new file mode 100644 index 0000000..a69f5ea --- /dev/null +++ b/libs/eigen/test/gpu_example.cu @@ -0,0 +1,129 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 The Eigen Team. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// The following is an example GPU test. + +#include "main.h" // Include the main test utilities. + +// Define a kernel functor. +// +// The kernel must be a POD type and implement operator(). +struct AddKernel { + // Parameters must be POD or serializable Eigen types (e.g. Matrix, + // Array). The return value must be a POD or serializable value type. + template + EIGEN_DEVICE_FUNC + Type3 operator()(const Type1& A, const Type2& B, Type3& C) const { + C = A + B; // Populate output parameter. + Type3 D = A + B; // Populate return value. + return D; + } +}; + +// Define a sub-test that uses the kernel. 
+template +void test_add(const T& type) { + const Index rows = type.rows(); + const Index cols = type.cols(); + + // Create random inputs. + const T A = T::Random(rows, cols); + const T B = T::Random(rows, cols); + T C; // Output parameter. + + // Create kernel. + AddKernel add_kernel; + + // Run add_kernel(A, B, C) via run(...). + // This will run on the GPU if using a GPU compiler, or CPU otherwise, + // facilitating generic tests that can run on either. + T D = run(add_kernel, A, B, C); + + // Check that both output parameter and return value are correctly populated. + const T expected = A + B; + VERIFY_IS_CWISE_EQUAL(C, expected); + VERIFY_IS_CWISE_EQUAL(D, expected); + + // In a GPU-only test, we can verify that the CPU and GPU produce the + // same results. + T C_cpu, C_gpu; + T D_cpu = run_on_cpu(add_kernel, A, B, C_cpu); // Runs on CPU. + T D_gpu = run_on_gpu(add_kernel, A, B, C_gpu); // Runs on GPU. + VERIFY_IS_CWISE_EQUAL(C_cpu, C_gpu); + VERIFY_IS_CWISE_EQUAL(D_cpu, D_gpu); +}; + +struct MultiplyKernel { + template + EIGEN_DEVICE_FUNC + Type3 operator()(const Type1& A, const Type2& B, Type3& C) const { + C = A * B; + return A * B; + } +}; + +template +void test_multiply(const T1& type1, const T2& type2, const T3& type3) { + const T1 A = T1::Random(type1.rows(), type1.cols()); + const T2 B = T2::Random(type2.rows(), type2.cols()); + T3 C; + + MultiplyKernel multiply_kernel; + + // The run(...) family of functions uses a memory buffer to transfer data back + // and forth to and from the device. The size of this buffer is estimated + // from the size of all input parameters. If the estimated buffer size is + // not sufficient for transferring outputs from device-to-host, then an + // explicit buffer size needs to be specified. + + // 2 outputs of size (A * B). For each matrix output, the buffer will store + // the number of rows, columns, and the data. + size_t buffer_capacity_hint = 2 * ( // 2 output parameters + 2 * sizeof(typename T3::Index) // # Rows, # Cols + + A.rows() * B.cols() * sizeof(typename T3::Scalar)); // Output data + + T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C); + + const T3 expected = A * B; + VERIFY_IS_CWISE_APPROX(C, expected); + VERIFY_IS_CWISE_APPROX(D, expected); + + T3 C_cpu, C_gpu; + T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu); + T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint, + multiply_kernel, A, B, C_gpu); + VERIFY_IS_CWISE_APPROX(C_cpu, C_gpu); + VERIFY_IS_CWISE_APPROX(D_cpu, D_gpu); +} + +// Declare the test fixture. +EIGEN_DECLARE_TEST(gpu_example) +{ + // For the number of repeats, call the desired subtests. + for(int i = 0; i < g_repeat; i++) { + // Call subtests with different sized/typed inputs. 
+ CALL_SUBTEST( test_add(Eigen::Vector3f()) ); + CALL_SUBTEST( test_add(Eigen::Matrix3d()) ); + CALL_SUBTEST( test_add(Eigen::MatrixX(10, 10)) ); + + CALL_SUBTEST( test_add(Eigen::Array44f()) ); + CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) ); + CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) ); + + CALL_SUBTEST( test_multiply(Eigen::Matrix3d(), + Eigen::Matrix3d(), + Eigen::Matrix3d()) ); + CALL_SUBTEST( test_multiply(Eigen::MatrixX(10, 10), + Eigen::MatrixX(10, 10), + Eigen::MatrixX()) ); + CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1), + Eigen::MatrixXf(1, 32), + Eigen::MatrixXf()) ); + } +} diff --git a/libs/eigen/test/gpu_test_helper.h b/libs/eigen/test/gpu_test_helper.h new file mode 100644 index 0000000..0942466 --- /dev/null +++ b/libs/eigen/test/gpu_test_helper.h @@ -0,0 +1,476 @@ +#ifndef GPU_TEST_HELPER_H +#define GPU_TEST_HELPER_H + +#include + +#ifdef EIGEN_GPUCC +#define EIGEN_USE_GPU +#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h" +#endif // EIGEN_GPUCC + +// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that +// doesn't allow std::tuple to compile for host code either. In these cases, +// use our custom implementation. +#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000) +#define EIGEN_USE_CUSTOM_TUPLE 1 +#else +#define EIGEN_USE_CUSTOM_TUPLE 0 +#endif + +#if EIGEN_USE_CUSTOM_TUPLE +#include "../Eigen/src/Core/arch/GPU/Tuple.h" +#else +#include +#endif +namespace Eigen { + +namespace internal { + +// Note: cannot re-use tuple_impl, since that will cause havoc for +// tuple_test. +namespace test_detail { +// Use std::tuple on CPU, otherwise use the GPU-specific versions. +#if !EIGEN_USE_CUSTOM_TUPLE +using std::tuple; +using std::get; +using std::make_tuple; +using std::tie; +#else +using tuple_impl::tuple; +using tuple_impl::get; +using tuple_impl::make_tuple; +using tuple_impl::tie; +#endif +#undef EIGEN_USE_CUSTOM_TUPLE +} // namespace test_detail + +template +struct extract_output_indices_helper; + +/** + * Extracts a set of indices corresponding to non-const l-value reference + * output types. + * + * \internal + * \tparam N the number of types {T1, Ts...}. + * \tparam Idx the "index" to append if T1 is an output type. + * \tparam OutputIndices the current set of output indices. + * \tparam T1 the next type to consider, with index Idx. + * \tparam Ts the remaining types. + */ +template +struct extract_output_indices_helper, T1, Ts...> { + using type = typename + extract_output_indices_helper< + N - 1, Idx + 1, + typename std::conditional< + // If is a non-const l-value reference, append index. + std::is_lvalue_reference::value + && !std::is_const>::value, + std::index_sequence, + std::index_sequence >::type, + Ts...>::type; +}; + +// Base case. +template +struct extract_output_indices_helper<0, Idx, std::index_sequence > { + using type = std::index_sequence; +}; + +// Extracts a set of indices into Types... that correspond to non-const +// l-value references. +template +using extract_output_indices = typename extract_output_indices_helper, Types...>::type; + +// Helper struct for dealing with Generic functors that may return void. +struct void_helper { + struct Void {}; + + // Converts void -> Void, T otherwise. + template + using ReturnType = typename std::conditional::value, Void, T>::type; + + // Non-void return value. + template + static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC + auto call(Func&& func, Args&&... 
args) -> + std::enable_if_t::value, + decltype(func(args...))> { + return func(std::forward(args)...); + } + + // Void return value. + template + static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC + auto call(Func&& func, Args&&... args) -> + std::enable_if_t::value, + Void> { + func(std::forward(args)...); + return Void{}; + } + + // Restores the original return type, Void -> void, T otherwise. + template + static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC + std::enable_if_t::type, Void>::value, T> + restore(T&& val) { + return val; + } + + // Void case. + template + static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC + void restore(const Void&) {} +}; + +// Runs a kernel via a serialized buffer. It does this by deserializing the buffer +// to construct the arguments, calling the kernel, then re-serializing the outputs. +// The buffer contains +// [ input_buffer_size, args ] +// After the kernel call, it is then populated with +// [ output_buffer_size, output_parameters, return_value ] +// If the output_buffer_size exceeds the buffer's capacity, then only the +// output_buffer_size is populated. +template +EIGEN_DEVICE_FUNC +void run_serialized(std::index_sequence, std::index_sequence, + Kernel kernel, uint8_t* buffer, size_t capacity) { + using test_detail::get; + using test_detail::make_tuple; + using test_detail::tuple; + // Deserialize input size and inputs. + size_t input_size; + const uint8_t* read_ptr = buffer; + const uint8_t* read_end = buffer + capacity; + read_ptr = Eigen::deserialize(read_ptr, read_end, input_size); + // Create value-type instances to populate. + auto args = make_tuple(typename std::decay::type{}...); + EIGEN_UNUSED_VARIABLE(args) // Avoid NVCC compile warning. + // NVCC 9.1 requires us to spell out the template parameters explicitly. + read_ptr = Eigen::deserialize(read_ptr, read_end, get::type...>(args)...); + + // Call function, with void->Void conversion so we are guaranteed a complete + // output type. + auto result = void_helper::call(kernel, get::type...>(args)...); + + // Determine required output size. + size_t output_size = Eigen::serialize_size(capacity); + output_size += Eigen::serialize_size(get::type...>(args)...); + output_size += Eigen::serialize_size(result); + + // Always serialize required buffer size. + uint8_t* write_ptr = buffer; + uint8_t* write_end = buffer + capacity; + write_ptr = Eigen::serialize(write_ptr, write_end, output_size); + // Null `write_ptr` can be safely passed along. + // Serialize outputs if they fit in the buffer. + if (output_size <= capacity) { + // Collect outputs and result. + write_ptr = Eigen::serialize(write_ptr, write_end, get::type...>(args)...); + write_ptr = Eigen::serialize(write_ptr, write_end, result); + } +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) { + run_serialized (std::make_index_sequence{}, + extract_output_indices{}, + kernel, buffer, capacity); +} + +#ifdef EIGEN_GPUCC + +// Checks for GPU errors and asserts / prints the error message. +#define GPU_CHECK(expr) \ +do { \ + gpuError_t err = expr; \ + if (err != gpuSuccess) { \ + printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \ + gpu_assert(false); \ + } \ +} while(0) + +// Calls run_serialized on the GPU. +template +__global__ +EIGEN_HIP_LAUNCH_BOUNDS_1024 +void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) { + run_serialized(kernel, buffer, capacity); +} + +// Runs kernel(args...) on the GPU via the serialization mechanism. 
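+// (Recap of the round trip, an explanatory note not in the original patch:
+// the host serializes [input_size, args...] into a byte buffer and copies it
+// to the device; the meta-kernel deserializes, invokes the functor, and
+// overwrites the buffer with [output_size, output_args..., return_value].
+// Only arguments selected by extract_output_indices, i.e. non-const lvalue
+// references, are written back: for a functor invoked as
+// kernel(const A&, B&, C), only the B& argument is treated as an output.)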
+// +// Note: this may end up calling the kernel multiple times if the initial output +// buffer is not large enough to hold the outputs. +template +auto run_serialized_on_gpu(size_t buffer_capacity_hint, + std::index_sequence, + std::index_sequence, + Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { + // Compute the required serialization buffer capacity. + // Round up input size to next power of two to give a little extra room + // for outputs. + size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...); + + size_t capacity; + if (buffer_capacity_hint == 0) { + // Estimate as the power of two larger than the total input size. + capacity = sizeof(size_t); + while (capacity <= input_data_size) { + capacity *= 2; + } + } else { + // Use the larger of the hint and the total input size. + // Add sizeof(size_t) to the hint to account for storing the buffer capacity + // itself so the user doesn't need to think about this. + capacity = std::max(buffer_capacity_hint + sizeof(size_t), + input_data_size); + } + std::vector buffer(capacity); + + uint8_t* host_data = nullptr; + uint8_t* host_data_end = nullptr; + uint8_t* host_ptr = nullptr; + uint8_t* device_data = nullptr; + size_t output_data_size = 0; + + // Allocate buffers and copy input data. + capacity = std::max(capacity, output_data_size); + buffer.resize(capacity); + host_data = buffer.data(); + host_data_end = buffer.data() + capacity; + host_ptr = Eigen::serialize(host_data, host_data_end, input_data_size); + host_ptr = Eigen::serialize(host_ptr, host_data_end, args...); + + // Copy inputs to the device. + gpuMalloc((void**)(&device_data), capacity); + gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice); + GPU_CHECK(gpuDeviceSynchronize()); + + // Run kernel. + #ifdef EIGEN_USE_HIP + hipLaunchKernelGGL( + HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel), + 1, 1, 0, 0, kernel, device_data, capacity); + #else + run_serialized_on_gpu_meta_kernel<<<1,1>>>( + kernel, device_data, capacity); + #endif + // Check pre-launch and kernel execution errors. + GPU_CHECK(gpuGetLastError()); + GPU_CHECK(gpuDeviceSynchronize()); + // Copy back new output to host. + gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost); + gpuFree(device_data); + GPU_CHECK(gpuDeviceSynchronize()); + + // Determine output buffer size. + const uint8_t* c_host_ptr = Eigen::deserialize(host_data, host_data_end, output_data_size); + // If the output doesn't fit in the buffer, print a warning and fail. + if (output_data_size > capacity) { + std::cerr << "The serialized output does not fit in the output buffer, " + << output_data_size << " vs capacity " << capacity << "." + << std::endl + << "Try specifying a minimum buffer capacity: " << std::endl + << " run_with_hint(" << output_data_size << ", ...)" + << std::endl; + VERIFY(false); + } + + // Deserialize outputs. + auto args_tuple = test_detail::tie(args...); + EIGEN_UNUSED_VARIABLE(args_tuple) // Avoid NVCC compile warning. + c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get(args_tuple)...); + + // Maybe deserialize return value, properly handling void. + typename void_helper::ReturnType result; + c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, result); + return void_helper::restore(result); +} + +#endif // EIGEN_GPUCC + +} // namespace internal + +/** + * Runs a kernel on the CPU, returning the results. + * \param kernel kernel to run. + * \param args ... input arguments. + * \return kernel(args...). 
+ */ +template +auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ + return kernel(std::forward(args)...); +} + +#ifdef EIGEN_GPUCC + +/** + * Runs a kernel on the GPU, returning the results. + * + * The kernel must be able to be passed directly as an input to a global + * function (i.e. empty or POD). Its inputs must be "Serializable" so we + * can transfer them to the device, and the output must be a Serializable value + * type so it can be transferred back from the device. + * + * \param kernel kernel to run. + * \param args ... input arguments, must be "Serializable". + * \return kernel(args...). + */ +template +auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ + return internal::run_serialized_on_gpu( + /*buffer_capacity_hint=*/ 0, + std::make_index_sequence{}, + internal::extract_output_indices{}, + kernel, std::forward(args)...); +} + +/** + * Runs a kernel on the GPU, returning the results. + * + * This version allows specifying a minimum buffer capacity required for + * serializing the outputs to transfer results from device to host. Use this when + * `run_on_gpu(...)` fails to determine an appropriate capacity by default. + * + * \param buffer_capacity_hint minimum required buffer size for serializing + * outputs. + * \param kernel kernel to run. + * \param args ... input arguments, must be "Serializable". + * \return kernel(args...). + * \sa run_on_gpu + */ +template +auto run_on_gpu_with_hint(size_t buffer_capacity_hint, + Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ + return internal::run_serialized_on_gpu( + buffer_capacity_hint, + std::make_index_sequence{}, + internal::extract_output_indices{}, + kernel, std::forward(args)...); +} + +/** + * Kernel for determining basic Eigen compile-time information + * (i.e. the cuda/hip arch) + */ +struct CompileTimeDeviceInfoKernel { + struct Info { + int cuda; + int hip; + }; + + EIGEN_DEVICE_FUNC + Info operator()() const + { + Info info = {-1, -1}; + #if defined(__CUDA_ARCH__) + info.cuda = static_cast(__CUDA_ARCH__ +0); + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + info.hip = static_cast(EIGEN_HIP_DEVICE_COMPILE +0); + #endif + return info; + } +}; + +/** + * Queries and prints the compile-time and runtime GPU info. + */ +void print_gpu_device_info() +{ + int device = 0; + gpuDeviceProp_t deviceProp; + gpuGetDeviceProperties(&deviceProp, device); + + auto info = run_on_gpu(CompileTimeDeviceInfoKernel()); + + std::cout << "GPU compile-time info:\n"; + + #ifdef EIGEN_CUDACC + std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << std::endl; + #endif + + #ifdef EIGEN_CUDA_SDK_VER + std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl; + #endif + + #ifdef EIGEN_COMP_NVCC + std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl; + #endif + + #ifdef EIGEN_HIPCC + std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << std::endl; + #endif + + std::cout << " EIGEN_CUDA_ARCH: " << info.cuda << std::endl; + std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl; + + std::cout << "GPU device info:\n"; + std::cout << " name: " << deviceProp.name << std::endl; + std::cout << " capability: " << deviceProp.major << "." 
<< deviceProp.minor << std::endl; + std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl; + std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl; + std::cout << " warpSize: " << deviceProp.warpSize << std::endl; + std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << std::endl; + std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << std::endl; + std::cout << " clockRate: " << deviceProp.clockRate << std::endl; + std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl; + std::cout << " computeMode: " << deviceProp.computeMode << std::endl; +} + +#endif // EIGEN_GPUCC + +/** + * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. + * + * This is to better support creating generic tests. + * + * The kernel must be able to be passed directly as an input to a global + * function (i.e. empty or POD). Its inputs must be "Serializable" so we + * can transfer them to the device, and the output must be a Serializable value + * type so it can be transferred back from the device. + * + * \param kernel kernel to run. + * \param args ... input arguments, must be "Serializable". + * \return kernel(args...). + */ +template +auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ +#ifdef EIGEN_GPUCC + return run_on_gpu(kernel, std::forward(args)...); +#else + return run_on_cpu(kernel, std::forward(args)...); +#endif +} + +/** + * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. + * + * This version allows specifying a minimum buffer capacity required for + * serializing the outputs to transfer results from device to host. Use this when + * `run(...)` fails to determine an appropriate capacity by default. + * + * \param buffer_capacity_hint minimum required buffer size for serializing + * outputs. + * \param kernel kernel to run. + * \param args ... input arguments, must be "Serializable". + * \return kernel(args...). + * \sa run + */ +template +auto run_with_hint(size_t buffer_capacity_hint, + Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ +#ifdef EIGEN_GPUCC + return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward(args)...); +#else + EIGEN_UNUSED_VARIABLE(buffer_capacity_hint) + return run_on_cpu(kernel, std::forward(args)...); +#endif +} + +} // namespace Eigen + +#endif // GPU_TEST_HELPER_H diff --git a/libs/eigen/test/half_float.cpp b/libs/eigen/test/half_float.cpp index 729de1b..00a8b48 100644 --- a/libs/eigen/test/half_float.cpp +++ b/libs/eigen/test/half_float.cpp @@ -157,6 +157,12 @@ void test_numtraits() VERIFY( (std::numeric_limits::denorm_min)() > half(0.f) ); VERIFY( (std::numeric_limits::min)()/half(2) > half(0.f) ); VERIFY_IS_EQUAL( (std::numeric_limits::denorm_min)()/half(2), half(0.f) ); + + // Test to see that we are able to link against the symbols for digits and + // digits10. 
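+  // (Explanatory note, not in the original patch: binding a reference
+  // ODR-uses the numeric_limits member, so the link fails if the symbol is
+  // not actually defined somewhere, and volatile keeps the compiler from
+  // constant-folding the reads away before the comparison below.)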
+ volatile const int& digits10 = std::numeric_limits::digits10; + volatile const int& digits = std::numeric_limits::digits; + VERIFY( (digits10) != (digits) ); } void test_arithmetic() @@ -224,6 +230,8 @@ void test_comparison() void test_basic_functions() { + constexpr float PI = static_cast(EIGEN_PI); + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); @@ -251,8 +259,8 @@ void test_basic_functions() VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); - VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); + VERIFY_IS_APPROX(float(numext::exp(half(PI))), 20.f + PI); + VERIFY_IS_APPROX(float(exp(half(PI))), 20.f + PI); VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f); VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f); @@ -277,25 +285,26 @@ void test_basic_functions() void test_trigonometric_functions() { + constexpr float PI = static_cast(EIGEN_PI); VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - // VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(PI)), half(cosf(PI))); + // VERIFY_IS_APPROX(numext::cos(half(PI/2)), half(cosf(PI/2))); + // VERIFY_IS_APPROX(numext::cos(half(3*PI/2)), half(cosf(3*PI/2))); VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::sin(half(PI)), half(sinf(PI))); + VERIFY_IS_APPROX(numext::sin(half(PI/2)), half(sinf(PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*PI/2)), half(sinf(3*PI/2))); VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::tan(half(PI)), half(tanf(PI))); + // VERIFY_IS_APPROX(numext::tan(half(PI/2)), half(tanf(PI/2))); + //VERIFY_IS_APPROX(numext::tan(half(3*PI/2)), half(tanf(3*PI/2))); VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); } diff --git a/libs/eigen/test/householder.cpp b/libs/eigen/test/householder.cpp index cad8138..3a3d047 100644 --- a/libs/eigen/test/householder.cpp +++ b/libs/eigen/test/householder.cpp @@ -30,7 +30,7 @@ template void householder(const MatrixType& m) typedef Matrix TMatrixType; - Matrix _tmp((std::max)(rows,cols)); + Matrix _tmp((std::max)(rows,cols)); Scalar* tmp = &_tmp.coeffRef(0,0); Scalar beta; @@ -133,6 +133,89 @@ template void householder(const MatrixType& m) VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating rhseq to a dense matrix, then applying } + +template +void householder_update(const MatrixType& m) { + // This 
test is covering the internal::householder_qr_inplace_update function. + // At time of writing, there is no public API that exposes this update behavior directly, + // so we are testing the internal implementation. + + const Index rows = m.rows(); + const Index cols = m.cols(); + + typedef typename MatrixType::Scalar Scalar; + typedef Matrix VectorType; + typedef Matrix HCoeffsVectorType; + typedef Matrix MatrixX; + typedef Matrix VectorX; + + VectorX tmpOwner(cols); + Scalar* tmp = tmpOwner.data(); + + // The matrix to factorize. + const MatrixType A = MatrixType::Random(rows, cols); + + // matQR and hCoeffs will hold the factorization of A, + // built by a sequence of calls to `update`. + MatrixType matQR(rows, cols); + HCoeffsVectorType hCoeffs(cols); + + // householder_qr_inplace_update should be able to build a QR factorization one column at a time. + // We verify this by starting with an empty factorization and 'updating' one column at a time. + // After each call to update, we should have a QR factorization of the columns presented so far. + + const Index size = (std::min)(rows, cols); // QR can only go up to 'size' because that's full rank. + for (Index k = 0; k != size; ++k) + { + // Make a copy of the column to prevent any possibility of 'leaking' other parts of A. + const VectorType newColumn = A.col(k); + internal::householder_qr_inplace_update(matQR, hCoeffs, newColumn, k, tmp); + + // Verify Property: + // matQR.leftCols(k+1) and hCoeffs.head(k+1) hold + // a QR factorization of A.leftCols(k+1). + // This is the fundamental guarantee of householder_qr_inplace_update. + { + const MatrixX matQR_k = matQR.leftCols(k + 1); + const VectorX hCoeffs_k = hCoeffs.head(k + 1); + MatrixX R = matQR_k.template triangularView(); + MatrixX QxR = householderSequence(matQR_k, hCoeffs_k.conjugate()) * R; + VERIFY_IS_APPROX(QxR, A.leftCols(k + 1)); + } + + // Verify Property: + // A sequence of calls to 'householder_qr_inplace_update' + // should produce the same result as 'householder_qr_inplace_unblocked'. + // This is a property of the current implementation. + // If these implementations diverge in the future, + // then simply delete the test of this property. + { + MatrixX QR_at_once = A.leftCols(k + 1); + VectorX hCoeffs_at_once(k + 1); + internal::householder_qr_inplace_unblocked(QR_at_once, hCoeffs_at_once, tmp); + VERIFY_IS_APPROX(QR_at_once, matQR.leftCols(k + 1)); + VERIFY_IS_APPROX(hCoeffs_at_once, hCoeffs.head(k + 1)); + } + } + + // Verify Property: + // We can go back and update any column to have a new value, + // and get a QR factorization of the columns up to that one. 
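+  // (How the update works, in brief; explanatory note not in the original
+  // patch: the reflectors already stored in columns 0..k-1 are applied to
+  // the incoming column, then a fresh Householder reflector is computed from
+  // its trailing entries to zero everything below the diagonal, so only
+  // column k of matQR and hCoeffs(k) change.)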
+ { + const Index k = internal::random(0, size - 1); + VectorType newColumn = VectorType::Random(rows); + internal::householder_qr_inplace_update(matQR, hCoeffs, newColumn, k, tmp); + + const MatrixX matQR_k = matQR.leftCols(k + 1); + const VectorX hCoeffs_k = hCoeffs.head(k + 1); + MatrixX R = matQR_k.template triangularView(); + MatrixX QxR = householderSequence(matQR_k, hCoeffs_k.conjugate()) * R; + VERIFY_IS_APPROX(QxR.leftCols(k), A.leftCols(k)); + VERIFY_IS_APPROX(QxR.col(k), newColumn); + } +} + + EIGEN_DECLARE_TEST(householder) { for(int i = 0; i < g_repeat; i++) { @@ -144,5 +227,9 @@ EIGEN_DECLARE_TEST(householder) CALL_SUBTEST_6( householder(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE),internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_7( householder(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE),internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_8( householder(Matrix()) ); + + CALL_SUBTEST_9( householder_update(Matrix()) ); + CALL_SUBTEST_9( householder_update(Matrix()) ); + CALL_SUBTEST_9( householder_update(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } } diff --git a/libs/eigen/test/indexed_view.cpp b/libs/eigen/test/indexed_view.cpp index 72c54af..d149960 100644 --- a/libs/eigen/test/indexed_view.cpp +++ b/libs/eigen/test/indexed_view.cpp @@ -7,38 +7,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifdef EIGEN_TEST_PART_2 -// Make sure we also check c++11 max implementation -#define EIGEN_MAX_CPP_VER 11 -#endif - -#ifdef EIGEN_TEST_PART_3 -// Make sure we also check c++98 max implementation -#define EIGEN_MAX_CPP_VER 03 - -// We need to disable this warning when compiling with c++11 while limiting Eigen to c++98 -// Ideally we would rather configure the compiler to build in c++98 mode but this needs -// to be done at the CMakeLists.txt level. 
-#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) - #pragma GCC diagnostic ignored "-Wdeprecated" -#endif - -#if defined(__GNUC__) && (__GNUC__ >=9) - #pragma GCC diagnostic ignored "-Wdeprecated-copy" -#endif -#if defined(__clang__) && (__clang_major__ >= 10) - #pragma clang diagnostic ignored "-Wdeprecated-copy" -#endif - -#endif - #include #include #include "main.h" -#if EIGEN_HAS_CXX11 +using Eigen::placeholders::all; +using Eigen::placeholders::last; +using Eigen::placeholders::lastp1; +using Eigen::placeholders::lastN; #include -#endif typedef std::pair IndexPair; @@ -63,7 +40,7 @@ bool match(const T& xpr, std::string ref, std::string str_xpr = "") { #define MATCH(X,R) match(X, R, #X) template -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> is_same_eq(const T1& a, const T2& b) { return (a == b).all(); @@ -82,7 +59,7 @@ bool is_same_seq(const T1& a, const T2& b) } template -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> is_same_seq_type(const T1& a, const T2& b) { return is_same_seq(a,b); @@ -102,11 +79,7 @@ void check_indexed_view() ArrayXd a = ArrayXd::LinSpaced(n,0,n-1); Array b = a.transpose(); - #if EIGEN_COMP_CXXVER>=14 ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ref(encode)); - #else - ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ptr_fun(&encode)); - #endif for(Index i=0; i(5),fix<-2>), seqN(2,fix<5>,fix<-2>()) ) ); VERIFY( is_same_seq_type( seq(2,fix<5>), seqN(2,4) ) ); -#if EIGEN_HAS_CXX11 VERIFY( is_same_seq_type( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) ); VERIFY( is_same_seq( seqN(2,std::integral_constant(),std::integral_constant()), seqN(2,fix<5>,fix<-2>()) ) ); VERIFY( is_same_seq( seq(std::integral_constant(),std::integral_constant(),std::integral_constant()), @@ -231,10 +203,6 @@ void check_indexed_view() VERIFY( is_same_seq_type( seqN(2,std::integral_constant()), seqN(2,fix<5>) ) ); VERIFY( is_same_seq_type( seq(std::integral_constant(),std::integral_constant()), seq(fix<1>,fix<5>) ) ); -#else - // sorry, no compile-time size recovery in c++98/03 - VERIFY( is_same_seq( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) ); -#endif VERIFY( (A(seqN(2,fix<5>), 5)).RowsAtCompileTime == 5); VERIFY( (A(4, all)).ColsAtCompileTime == Dynamic); @@ -310,7 +278,6 @@ void check_indexed_view() A(seq(last-5,last-1,2), seqN(last-3,3,fix<-2>)).reverse() ); } -#if EIGEN_HAS_CXX11 // check lastN VERIFY_IS_APPROX( a(lastN(3)), a.tail(3) ); VERIFY( MATCH( a(lastN(3)), "7\n8\n9" ) ); @@ -323,7 +290,6 @@ void check_indexed_view() VERIFY_IS_APPROX( (A(std::array{{1,3,5}}, std::array{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array{{3, 1, 6, 5}}, all) ); VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array{{3, 1, 6, 5}}) ); VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array{{1,3,5}},std::array{{3, 1, 6, 5}}) ); @@ -336,9 +302,6 @@ void check_indexed_view() VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array{{3, 1, 6, 5}}) ); VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 ); -#endif - -#endif // check mat(i,j) with weird types for i and j { @@ -396,13 +359,11 @@ void check_indexed_view() a(XX) = 1; A(XX,YY) = 1; // Anonymous enums only work with C++11 -#if EIGEN_HAS_CXX11 enum { X=0, Y=1 }; a(X) = 1; A(X,Y) = 1; A(XX,Y) = 1; A(X,YY) = 1; -#endif // Check compilation of varying integer types as index types: Index i = n/2; @@ -442,13 +403,21 @@ void check_indexed_view() VERIFY( MATCH( A(all,1)(1), "101")); } -#if 
EIGEN_HAS_CXX11 + // bug #2375: indexing over matrices of dim >128 should compile on gcc + { + Matrix large_mat = Matrix::Random(); + std::array test_indices = {0, 1}; + Matrix thin_slice = large_mat(all, test_indices); + for(int col = 0; col < int(test_indices.size()); ++col) + for(int row = 0; row < large_mat.rows(); ++row) + VERIFY_IS_EQUAL( thin_slice(row, col), large_mat(row, col) ); + } + //Bug IndexView with a single static row should be RowMajor: { // A(1, seq(0,2,1)).cwiseAbs().colwise().replicate(2).eval(); STATIC_CHECK(( (internal::evaluator::Flags & RowMajorBit) == RowMajorBit )); } -#endif } @@ -456,8 +425,6 @@ EIGEN_DECLARE_TEST(indexed_view) { // for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_indexed_view() ); - CALL_SUBTEST_2( check_indexed_view() ); - CALL_SUBTEST_3( check_indexed_view() ); // } // static checks of some internals: diff --git a/libs/eigen/test/initializer_list_construction.cpp b/libs/eigen/test/initializer_list_construction.cpp index 7a9c49e..b576ec2 100644 --- a/libs/eigen/test/initializer_list_construction.cpp +++ b/libs/eigen/test/initializer_list_construction.cpp @@ -7,7 +7,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#define EIGEN_NO_STATIC_ASSERT +#if defined(__GNUC__) && __GNUC__ >= 10 +// GCC 10+ has a bug for unsigned char that thinks we're writing past the +// end of an array when compiled with -O3. This warning is not triggered for +// any other types, nor for other compilers, nor for other optimization levels. +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif #include "main.h" @@ -320,16 +325,6 @@ template void dynamicVectorConstruction() VERIFY(v.cols() == 1); VERIFY_IS_EQUAL(v, (VectorX {{raw[0], raw[1], raw[2], raw[3]}})); } - - { - VERIFY_RAISES_ASSERT((VectorX {raw[0], raw[1], raw[2], raw[3]})); - } - { - VERIFY_RAISES_ASSERT((VectorX { - {raw[0], raw[1], raw[2], raw[3]}, - {raw[0], raw[1], raw[2], raw[3]}, - })); - } } EIGEN_DECLARE_TEST(initializer_list_construction) diff --git a/libs/eigen/test/integer_types.cpp b/libs/eigen/test/integer_types.cpp index 31f4100..1322527 100644 --- a/libs/eigen/test/integer_types.cpp +++ b/libs/eigen/test/integer_types.cpp @@ -7,8 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#define EIGEN_NO_STATIC_ASSERT - #include "main.h" #undef VERIFY_IS_APPROX @@ -162,12 +160,10 @@ EIGEN_DECLARE_TEST(integer_types) CALL_SUBTEST_6( integer_type_tests(Matrix()) ); -#if EIGEN_HAS_CXX11 CALL_SUBTEST_7( integer_type_tests(Matrix()) ); CALL_SUBTEST_7( signed_integer_type_tests(Matrix()) ); CALL_SUBTEST_8( integer_type_tests(Matrix(1, 5)) ); -#endif } CALL_SUBTEST_9( integer_types_extra<0>() ); } diff --git a/libs/eigen/test/inverse.cpp b/libs/eigen/test/inverse.cpp index 9cedfa1..2748c38 100644 --- a/libs/eigen/test/inverse.cpp +++ b/libs/eigen/test/inverse.cpp @@ -12,12 +12,12 @@ #include template -void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if::type* = 0) +void inverse_for_fixed_size(const MatrixType&, std::enable_if_t* = 0) { } template -void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if::type* = 0) +void inverse_for_fixed_size(const MatrixType& m1, std::enable_if_t* = 0) { using std::abs; diff --git a/libs/eigen/test/jacobi.cpp b/libs/eigen/test/jacobi.cpp index 5604797..273b94d 100644 --- a/libs/eigen/test/jacobi.cpp +++ b/libs/eigen/test/jacobi.cpp @@ -65,6 +65,11 @@ EIGEN_DECLARE_TEST(jacobi) CALL_SUBTEST_3(( jacobi() )); CALL_SUBTEST_3(( jacobi >() )); + CALL_SUBTEST_1(( jacobi, float>() )); + CALL_SUBTEST_2(( jacobi, double>() )); + CALL_SUBTEST_3(( jacobi, 4, 4, RowMajor>, float>() )); + CALL_SUBTEST_3(( jacobi, 4, 4, RowMajor>, std::complex >() )); + int r = internal::random(2, internal::random(1,EIGEN_TEST_MAX_SIZE)/2), c = internal::random(2, internal::random(1,EIGEN_TEST_MAX_SIZE)/2); CALL_SUBTEST_4(( jacobi(MatrixXf(r,c)) )); diff --git a/libs/eigen/test/jacobisvd.cpp b/libs/eigen/test/jacobisvd.cpp index 5b15c5a..daf24a7 100644 --- a/libs/eigen/test/jacobisvd.cpp +++ b/libs/eigen/test/jacobisvd.cpp @@ -8,6 +8,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// We explicitly disable deprecated declarations for this set of tests +// because we purposely verify assertions for the deprecated SVD runtime +// option behavior. 
+#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#elif defined(_MSC_VER) +#pragma warning( disable : 4996 ) +#endif + // discard stack allocation as that too bypasses malloc #define EIGEN_STACK_ALLOCATION_LIMIT 0 #define EIGEN_RUNTIME_NO_MALLOC @@ -16,49 +25,9 @@ #define SVD_DEFAULT(M) JacobiSVD #define SVD_FOR_MIN_NORM(M) JacobiSVD +#define SVD_STATIC_OPTIONS(M, O) JacobiSVD #include "svd_common.h" -// Check all variants of JacobiSVD -template -void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true) -{ - MatrixType m = a; - if(pickrandom) - svd_fill_random(m); - - CALL_SUBTEST(( svd_test_all_computation_options >(m, true) )); // check full only - CALL_SUBTEST(( svd_test_all_computation_options >(m, false) )); - CALL_SUBTEST(( svd_test_all_computation_options >(m, false) )); - if(m.rows()==m.cols()) - CALL_SUBTEST(( svd_test_all_computation_options >(m, false) )); -} - -template void jacobisvd_verify_assert(const MatrixType& m) -{ - svd_verify_assert >(m); - svd_verify_assert >(m, true); - svd_verify_assert >(m); - svd_verify_assert >(m); - Index rows = m.rows(); - Index cols = m.cols(); - - enum { - ColsAtCompileTime = MatrixType::ColsAtCompileTime - }; - - - MatrixType a = MatrixType::Zero(rows, cols); - a.setZero(); - - if (ColsAtCompileTime == Dynamic) - { - JacobiSVD svd_fullqr; - VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeFullU|ComputeThinV)) - VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeThinV)) - VERIFY_RAISES_ASSERT(svd_fullqr.compute(a, ComputeThinU|ComputeFullV)) - } -} - template void jacobisvd_method() { @@ -69,11 +38,62 @@ void jacobisvd_method() VERIFY_IS_APPROX(m.jacobiSvd().singularValues(), RealVecType::Ones()); VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixU()); VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixV()); + VERIFY_IS_APPROX(m.template jacobiSvd().solve(m), m); + VERIFY_IS_APPROX(m.template jacobiSvd().transpose().solve(m), m); + VERIFY_IS_APPROX(m.template jacobiSvd().adjoint().solve(m), m); + + // Deprecated behavior. VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m); VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m); VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); } +template +void jacobisvd_all_options(const MatrixType& input = MatrixType()) { + MatrixType m(input.rows(), input.cols()); + svd_fill_random(m); + svd_option_checks(m); + svd_option_checks(m); + svd_option_checks(m); + svd_option_checks_full_only( + m); // FullPiv only used when computing full unitaries +} + +template +void jacobisvd_verify_assert(const MatrixType& m = MatrixType()) { + svd_verify_assert(m); + svd_verify_assert(m); + svd_verify_assert(m); + svd_verify_assert_full_only(m); + + svd_verify_constructor_options_assert>(m); + svd_verify_constructor_options_assert>(m); + svd_verify_constructor_options_assert>(m); + svd_verify_constructor_options_assert>(m, true); +} + +template +void jacobisvd_verify_inputs(const MatrixType& m = MatrixType()) { + // check defaults + typedef JacobiSVD DefaultSVD; + DefaultSVD defaultSvd(m); + VERIFY((int)DefaultSVD::QRPreconditioner == (int)ColPivHouseholderQRPreconditioner); + VERIFY(!defaultSvd.computeU()); + VERIFY(!defaultSvd.computeV()); + + // ColPivHouseholderQR is always default in presence of other options. 
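+  // (Illustrative sketch, not part of the original patch: with the
+  // compile-time Options parameter the computation choices move into the
+  // type. Assuming MatrixXf m and VectorXf b:
+  // \code
+  //   JacobiSVD<MatrixXf, ComputeThinU | ComputeThinV> svd(m);
+  //   VectorXf x = svd.solve(b);
+  // \endcode
+  // The deprecated form passed ComputeThinU | ComputeThinV to the
+  // constructor at runtime instead.)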
+ VERIFY(((int)JacobiSVD::QRPreconditioner == (int)ColPivHouseholderQRPreconditioner)); + VERIFY(((int)JacobiSVD::QRPreconditioner == (int)ColPivHouseholderQRPreconditioner)); + VERIFY(((int)JacobiSVD::QRPreconditioner == + (int)ColPivHouseholderQRPreconditioner)); + VERIFY(((int)JacobiSVD::QRPreconditioner == + (int)ColPivHouseholderQRPreconditioner)); + VERIFY(((int)JacobiSVD::QRPreconditioner == + (int)ColPivHouseholderQRPreconditioner)); + VERIFY(((int)JacobiSVD::QRPreconditioner == + (int)ColPivHouseholderQRPreconditioner)); +} + namespace Foo { // older compiler require a default constructor for Bar // cf: https://stackoverflow.com/questions/7411515/ @@ -86,62 +106,91 @@ void msvc_workaround() { const Foo::Bar a; const Foo::Bar b; - std::max EIGEN_NOT_A_MACRO (a,b); + const Foo::Bar c = std::max EIGEN_NOT_A_MACRO (a,b); + EIGEN_UNUSED_VARIABLE(c) } EIGEN_DECLARE_TEST(jacobisvd) { - CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) )); - CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) )); - CALL_SUBTEST_7(( jacobisvd_verify_assert(MatrixXf(10,12)) )); - CALL_SUBTEST_8(( jacobisvd_verify_assert(MatrixXcd(7,5)) )); - - CALL_SUBTEST_11(svd_all_trivial_2x2(jacobisvd)); - CALL_SUBTEST_12(svd_all_trivial_2x2(jacobisvd)); + CALL_SUBTEST_1((jacobisvd_verify_inputs())); + CALL_SUBTEST_1((jacobisvd_verify_inputs(Matrix(5, 6)))); + CALL_SUBTEST_1((jacobisvd_verify_inputs, 7, 5>>())); - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_3(( jacobisvd() )); - CALL_SUBTEST_4(( jacobisvd() )); - CALL_SUBTEST_5(( jacobisvd >() )); - CALL_SUBTEST_6(( jacobisvd >(Matrix(10,2)) )); + CALL_SUBTEST_2((jacobisvd_verify_assert())); + CALL_SUBTEST_2((jacobisvd_verify_assert())); + CALL_SUBTEST_2((jacobisvd_verify_assert>())); + CALL_SUBTEST_2((jacobisvd_verify_assert>())); + CALL_SUBTEST_2((jacobisvd_verify_assert(MatrixXf(10, 12)))); + CALL_SUBTEST_2((jacobisvd_verify_assert(MatrixXcd(7, 5)))); + CALL_SUBTEST_3(svd_all_trivial_2x2(jacobisvd_all_options)); + CALL_SUBTEST_4(svd_all_trivial_2x2(jacobisvd_all_options)); + + for (int i = 0; i < g_repeat; i++) { int r = internal::random(1, 30), c = internal::random(1, 30); TEST_SET_BUT_UNUSED_VARIABLE(r) TEST_SET_BUT_UNUSED_VARIABLE(c) - CALL_SUBTEST_10(( jacobisvd(MatrixXd(r,c)) )); - CALL_SUBTEST_7(( jacobisvd(MatrixXf(r,c)) )); - CALL_SUBTEST_8(( jacobisvd(MatrixXcd(r,c)) )); - (void) r; - (void) c; + CALL_SUBTEST_5((jacobisvd_all_options())); + CALL_SUBTEST_6((jacobisvd_all_options())); + CALL_SUBTEST_7((jacobisvd_all_options>())); + CALL_SUBTEST_8((jacobisvd_all_options>())); + CALL_SUBTEST_9((jacobisvd_all_options>())); + CALL_SUBTEST_10((jacobisvd_all_options>(Matrix(r, 5)))); + CALL_SUBTEST_11((jacobisvd_all_options>(Matrix(5, c)))); + CALL_SUBTEST_12((jacobisvd_all_options(MatrixXf(r, c)))); + CALL_SUBTEST_13((jacobisvd_all_options(MatrixXcd(r, c)))); + CALL_SUBTEST_14((jacobisvd_all_options(MatrixXd(r, c)))); + CALL_SUBTEST_15((jacobisvd_all_options>())); + CALL_SUBTEST_16((jacobisvd_all_options>())); + + MatrixXcd noQRTest = MatrixXcd(r, r); + svd_fill_random(noQRTest); + CALL_SUBTEST_17((svd_option_checks(noQRTest))); + + CALL_SUBTEST_18(( + svd_check_max_size_matrix, ColPivHouseholderQRPreconditioner>( + r, c))); + CALL_SUBTEST_18( + (svd_check_max_size_matrix, HouseholderQRPreconditioner>(r, + c))); + CALL_SUBTEST_18(( + svd_check_max_size_matrix, ColPivHouseholderQRPreconditioner>( + r, c))); + CALL_SUBTEST_18( + (svd_check_max_size_matrix, HouseholderQRPreconditioner>(r, + c))); // Test on inf/nan matrix - CALL_SUBTEST_7( (svd_inf_nan, 
MatrixXf>()) ); - CALL_SUBTEST_10( (svd_inf_nan, MatrixXd>()) ); + CALL_SUBTEST_19((svd_inf_nan())); + CALL_SUBTEST_19((svd_inf_nan())); - // bug1395 test compile-time vectors as input - CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix()) )); - CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix()) )); - CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix(r)) )); - CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix(c)) )); + CALL_SUBTEST_20((jacobisvd_verify_assert>())); + CALL_SUBTEST_20((jacobisvd_verify_assert>())); + CALL_SUBTEST_20((jacobisvd_verify_assert>(Matrix(r)))); + CALL_SUBTEST_20((jacobisvd_verify_assert>(Matrix(c)))); } - CALL_SUBTEST_7(( jacobisvd(MatrixXf(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); - CALL_SUBTEST_8(( jacobisvd(MatrixXcd(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/3))) )); + CALL_SUBTEST_21((jacobisvd_all_options( + MatrixXd(internal::random(EIGEN_TEST_MAX_SIZE / 4, EIGEN_TEST_MAX_SIZE / 2), + internal::random(EIGEN_TEST_MAX_SIZE / 4, EIGEN_TEST_MAX_SIZE / 2))))); + CALL_SUBTEST_22((jacobisvd_all_options( + MatrixXcd(internal::random(EIGEN_TEST_MAX_SIZE / 4, EIGEN_TEST_MAX_SIZE / 3), + internal::random(EIGEN_TEST_MAX_SIZE / 4, EIGEN_TEST_MAX_SIZE / 3))))); // test matrixbase method - CALL_SUBTEST_1(( jacobisvd_method() )); - CALL_SUBTEST_3(( jacobisvd_method() )); + CALL_SUBTEST_23(( jacobisvd_method() )); + CALL_SUBTEST_23(( jacobisvd_method() )); // Test problem size constructors - CALL_SUBTEST_7( JacobiSVD(10,10) ); + CALL_SUBTEST_24( JacobiSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - CALL_SUBTEST_9( svd_preallocate() ); + CALL_SUBTEST_25( svd_preallocate() ); - CALL_SUBTEST_2( svd_underoverflow() ); + CALL_SUBTEST_26( svd_underoverflow() ); msvc_workaround(); } diff --git a/libs/eigen/test/main.h b/libs/eigen/test/main.h index 07f3794..a52da9e 100644 --- a/libs/eigen/test/main.h +++ b/libs/eigen/test/main.h @@ -1,4 +1,3 @@ - // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // @@ -23,7 +22,7 @@ // The following includes of STL headers have to be done _before_ the // definition of macros min() and max(). The reason is that many STL // implementations will not work properly as the min and max symbols collide -// with the STL functions std:min() and std::max(). The STL headers may check +// with the STL functions std::min() and std::max(). The STL headers may check // for the macro definition of min/max and issue a warning or undefine the // macros. // @@ -54,27 +53,41 @@ #include #endif #endif +#if __cplusplus > 201703L +// libstdc++ 9's indirectly uses max() via . +// libstdc++ 10's indirectly uses max() via ranges headers. +#include +// libstdc++ 11's indirectly uses max() via semaphore headers. +#include +#endif -// Same for cuda_fp16.h -#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) - // Means the compiler is either nvcc or clang with CUDA enabled +// Configure GPU. 
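A sketch of the intended outcome of this block; the header names are my assumption for the elided includes. HIP builds define EIGEN_HIPCC and pull in the HIP runtime headers, CUDA builds define EIGEN_CUDACC and pull in the CUDA ones (plus fp16 support from CUDA 7.5 on), and in either case the harness then disables long double tests and narrows the dense index type to int so host and device agree:

    #if defined(EIGEN_HIPCC)
      // hipcc path, e.g. <hip/hip_runtime.h>
    #elif defined(EIGEN_CUDACC)
      // nvcc / clang-cuda path, e.g. <cuda_runtime.h>, <cuda_fp16.h> when CUDA >= 7.5
    #endif

    // Test-side consequence (sketch): indices are 32-bit on GPU builds, since
    // EIGEN_DEFAULT_DENSE_INDEX_TYPE is set to int before Eigen is included.
    #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
      static_assert(sizeof(Eigen::Index) == sizeof(int), "narrowed for device/host agreement");
    #endif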
+#if defined(EIGEN_USE_HIP) + #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + #define EIGEN_HIPCC __HIPCC__ + #include + #include + #endif +#elif defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) #define EIGEN_CUDACC __CUDACC__ + #include + #include + #include + #if CUDA_VERSION >= 7050 + #include + #endif #endif -#if defined(EIGEN_CUDACC) -#include - #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) -#else - #define EIGEN_CUDA_SDK_VER 0 -#endif -#if EIGEN_CUDA_SDK_VER >= 70500 -#include + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) + #define EIGEN_TEST_NO_LONGDOUBLE + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #endif // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. -#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL) +#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_POCKETFFT_DEFAULT) // // HIP header files include the following files // @@ -289,9 +302,8 @@ namespace Eigen #endif //EIGEN_EXCEPTIONS #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS - // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3 #define eigen_assert(a) \ - if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ + if( (!(a)) && (!no_more_assert) ) \ { \ Eigen::no_more_assert = true; \ if(report_on_cerr_on_assert_failure) \ @@ -314,36 +326,10 @@ namespace Eigen #endif // EIGEN_EXCEPTIONS #endif // EIGEN_DEBUG_ASSERTS - #if defined(TEST_CHECK_STATIC_ASSERTIONS) && defined(EIGEN_EXCEPTIONS) - #define EIGEN_STATIC_ASSERT(a,MSG) \ - if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ - { \ - Eigen::no_more_assert = true; \ - if(report_on_cerr_on_assert_failure) \ - eigen_plain_assert((a) && #MSG); \ - else \ - EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \ - } - #define VERIFY_RAISES_STATIC_ASSERT(a) { \ - Eigen::no_more_assert = false; \ - Eigen::report_on_cerr_on_assert_failure = false; \ - try { \ - a; \ - VERIFY(Eigen::should_raise_an_assert && # a); \ - } \ - catch (Eigen::eigen_static_assert_exception&) { VERIFY(true); } \ - Eigen::report_on_cerr_on_assert_failure = true; \ - } - #endif // TEST_CHECK_STATIC_ASSERTIONS - #ifndef VERIFY_RAISES_ASSERT #define VERIFY_RAISES_ASSERT(a) \ std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n"; #endif -#ifndef VERIFY_RAISES_STATIC_ASSERT - #define VERIFY_RAISES_STATIC_ASSERT(a) \ - std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; -#endif #if !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) #define EIGEN_USE_CUSTOM_ASSERT @@ -352,12 +338,11 @@ namespace Eigen #else // EIGEN_NO_ASSERTION_CHECKING #define VERIFY_RAISES_ASSERT(a) {} - #define VERIFY_RAISES_STATIC_ASSERT(a) {} #endif // EIGEN_NO_ASSERTION_CHECKING #define EIGEN_INTERNAL_DEBUGGING -#include // required for createRandomPIMatrixOfRank +#include // required for createRandomPIMatrixOfRank and generateRandomMatrixSvs inline void verify_impl(bool condition, const char *testname, const char *file, int line, const char *condition_as_string) { @@ -391,6 +376,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b)) #define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_isApproxOrLessThan(a, b)) #define 
VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_isApproxOrLessThan(a, b)) +#define VERIFY_IS_CWISE_EQUAL(a, b) VERIFY(verifyIsCwiseApprox(a, b, true)) +#define VERIFY_IS_CWISE_APPROX(a, b) VERIFY(verifyIsCwiseApprox(a, b, false)) #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a)) @@ -403,10 +390,25 @@ inline void verify_impl(bool condition, const char *testname, const char *file, } while (0) +// Forward declarations to avoid ICC warnings +#if EIGEN_COMP_ICC + +template std::string type_name(); + +namespace Eigen { + +template +bool test_is_equal(const T& actual, const U& expected, bool expect_equal=true); + +} // end namespace Eigen + +#endif // EIGEN_COMP_ICC + + namespace Eigen { template -typename internal::enable_if::value,bool>::type +std::enable_if_t::value,bool> is_same_type(const T1&, const T2&) { return true; @@ -422,7 +424,13 @@ template<> inline long double test_precision >() { ret #define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE) \ inline bool test_isApprox(TYPE a, TYPE b) \ - { return internal::isApprox(a, b, test_precision()); } \ + { return numext::equal_strict(a, b) || \ + ((numext::isnan)(a) && (numext::isnan)(b)) || \ + (internal::isApprox(a, b, test_precision())); } \ + inline bool test_isCwiseApprox(TYPE a, TYPE b, bool exact) \ + { return numext::equal_strict(a, b) || \ + ((numext::isnan)(a) && (numext::isnan)(b)) || \ + (!exact && internal::isApprox(a, b, test_precision())); } \ inline bool test_isMuchSmallerThan(TYPE a, TYPE b) \ { return internal::isMuchSmallerThan(a, b, test_precision()); } \ inline bool test_isApproxOrLessThan(TYPE a, TYPE b) \ @@ -434,10 +442,8 @@ EIGEN_TEST_SCALAR_TEST_OVERLOAD(int) EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned int) EIGEN_TEST_SCALAR_TEST_OVERLOAD(long) EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long) -#if EIGEN_HAS_CXX11 EIGEN_TEST_SCALAR_TEST_OVERLOAD(long long) EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long long) -#endif EIGEN_TEST_SCALAR_TEST_OVERLOAD(float) EIGEN_TEST_SCALAR_TEST_OVERLOAD(double) EIGEN_TEST_SCALAR_TEST_OVERLOAD(half) @@ -543,7 +549,7 @@ typename T1::RealScalar test_relative_error(const SparseMatrixBase &a, const } template -typename NumTraits::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if::Real>::value, T1>::type* = 0) +typename NumTraits::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, std::enable_if_t::Real>::value, T1>* = 0) { typedef typename NumTraits::Real>::NonInteger RealScalar; return numext::sqrt(RealScalar(numext::abs2(a-b))/(numext::mini)(RealScalar(numext::abs2(a)),RealScalar(numext::abs2(b)))); @@ -575,7 +581,7 @@ typename NumTraits::Real get_test_precision(const T&, const } template -typename NumTraits::Real get_test_precision(const T&,typename internal::enable_if::Real>::value, T>::type* = 0) +typename NumTraits::Real get_test_precision(const T&,std::enable_if_t::Real>::value, T>* = 0) { return test_precision::Real>(); } @@ -592,6 +598,22 @@ inline bool verifyIsApprox(const Type1& a, const Type2& b) return ret; } +// verifyIsCwiseApprox is a wrapper to test_isCwiseApprox that outputs the relative difference magnitude if the test fails. 
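The two new macros differ from VERIFY_IS_APPROX in two ways: the comparison is coefficient-wise, and a NaN is accepted wherever the other operand has a NaN in the same position (plain isApprox() returns false as soon as any NaN is involved). A usage sketch with made-up values, relying on the harness's float tolerance test_precision<float>():

    Eigen::MatrixXf a(1, 3), b(1, 3);
    const float nan = std::numeric_limits<float>::quiet_NaN();
    a << 1.0f, nan, 3.0f;
    b << 1.0f, nan, 3.0003f;

    VERIFY_IS_CWISE_APPROX(a, b);    // passes: NaNs line up, 3.0f vs 3.0003f is within tolerance
    // VERIFY_IS_CWISE_EQUAL(a, b);  // would fail: the last coefficient is not an exact match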
+template +inline bool verifyIsCwiseApprox(const Type1& a, const Type2& b, bool exact) +{ + bool ret = test_isCwiseApprox(a,b,exact); + if(!ret) { + if (exact) { + std::cerr << "Values are not an exact match"; + } else { + std::cerr << "Difference too large wrt tolerance " << get_test_precision(a); + } + std::cerr << ", relative error is: " << test_relative_error(a,b) << std::endl; + } + return ret; +} + // The idea behind this function is to compare the two scalars a and b where // the scalar ref is a hint about the expected order of magnitude of a and b. // WARNING: the scalar a and b must be positive @@ -625,14 +647,39 @@ inline bool test_isUnitary(const MatrixBase& m) return m.isUnitary(test_precision::Scalar>()); } -// Forward declaration to avoid ICC warning -template -bool test_is_equal(const T& actual, const U& expected, bool expect_equal=true); +// Checks component-wise, works with infs and nans. +template +bool test_isCwiseApprox(const DenseBase& m1, + const DenseBase& m2, + bool exact) { + if (m1.rows() != m2.rows()) { + return false; + } + if (m1.cols() != m2.cols()) { + return false; + } + for (Index r = 0; r < m1.rows(); ++r) { + for (Index c = 0; c < m1.cols(); ++c) { + if (m1(r, c) != m2(r, c) + && !((numext::isnan)(m1(r, c)) && (numext::isnan)(m2(r, c))) + && (exact || !test_isApprox(m1(r, c), m2(r, c)))) { + return false; + } + } + } + return true; +} + +template +bool test_isCwiseApprox(const SparseMatrixBase& m1, + const SparseMatrixBase& m2, bool exact) { + return test_isCwiseApprox(m1.toDense(), m2.toDense(), exact); +} template bool test_is_equal(const T& actual, const U& expected, bool expect_equal) { - if ((actual==expected) == expect_equal) + if (numext::equal_strict(actual, expected) == expect_equal) return true; // false: std::cerr @@ -641,80 +688,39 @@ bool test_is_equal(const T& actual, const U& expected, bool expect_equal) return false; } -/** Creates a random Partial Isometry matrix of given rank. - * - * A partial isometry is a matrix all of whose singular values are either 0 or 1. - * This is very useful to test rank-revealing algorithms. 
- */ -// Forward declaration to avoid ICC warning -template -void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m); -template -void createRandomPIMatrixOfRank(Index desired_rank, Index rows, Index cols, MatrixType& m) -{ - typedef typename internal::traits::Scalar Scalar; - enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime }; - typedef Matrix VectorType; - typedef Matrix MatrixAType; - typedef Matrix MatrixBType; - - if(desired_rank == 0) - { - m.setZero(rows,cols); - return; - } - - if(desired_rank == 1) - { - // here we normalize the vectors to get a partial isometry - m = VectorType::Random(rows).normalized() * VectorType::Random(cols).normalized().transpose(); - return; - } - - MatrixAType a = MatrixAType::Random(rows,rows); - MatrixType d = MatrixType::Identity(rows,cols); - MatrixBType b = MatrixBType::Random(cols,cols); - - // set the diagonal such that only desired_rank non-zero entries reamain - const Index diag_size = (std::min)(d.rows(),d.cols()); - if(diag_size != desired_rank) - d.diagonal().segment(desired_rank, diag_size-desired_rank) = VectorType::Zero(diag_size-desired_rank); - - HouseholderQR qra(a); - HouseholderQR qrb(b); - m = qra.householderQ() * d * qrb.householderQ(); -} - -// Forward declaration to avoid ICC warning -template -void randomPermutationVector(PermutationVectorType& v, Index size); -template -void randomPermutationVector(PermutationVectorType& v, Index size) -{ - typedef typename PermutationVectorType::Scalar Scalar; - v.resize(size); - for(Index i = 0; i < size; ++i) v(i) = Scalar(i); - if(size == 1) return; - for(Index n = 0; n < 3 * size; ++n) - { - Index i = internal::random(0, size-1); - Index j; - do j = internal::random(0, size-1); while(j==i); - std::swap(v(i), v(j)); - } -} +/** + * Check if number is "not a number" (NaN). + * + * @tparam T input type + * @param x input value + * @return true, if input value is "not a number" (NaN) + */ template bool isNotNaN(const T& x) { return x==x; } +/** + * Check if number is plus infinity. + * + * @tparam T input type + * @param x input value + * @return true, if input value is plus infinity + */ template bool isPlusInf(const T& x) { return x > NumTraits::highest(); } +/** + * Check if number is minus infinity. + * + * @tparam T input type + * @param x input value + * @return true, if input value is minus infinity + */ template bool isMinusInf(const T& x) { return x < NumTraits::lowest(); @@ -722,6 +728,10 @@ template bool isMinusInf(const T& x) } // end namespace Eigen + +#include "random_matrix_helper.h" + + template struct GetDifferentType; template<> struct GetDifferentType { typedef double type; }; @@ -729,8 +739,6 @@ template<> struct GetDifferentType { typedef float type; }; template struct GetDifferentType > { typedef std::complex::type> type; }; -// Forward declaration to avoid ICC warning -template std::string type_name(); template std::string type_name() { return "other"; } template<> std::string type_name() { return "float"; } template<> std::string type_name() { return "double"; } @@ -743,6 +751,11 @@ template<> std::string type_name >() { return "comple using namespace Eigen; +/** + * Set number of repetitions for unit test from input string. + * + * @param str input string + */ inline void set_repeat_from_string(const char *str) { errno = 0; @@ -755,6 +768,11 @@ inline void set_repeat_from_string(const char *str) g_has_set_repeat = true; } +/** + * Set seed for randomized unit tests from input string. 
+ * + * @param str input string + */ inline void set_seed_from_string(const char *str) { errno = 0; @@ -855,3 +873,5 @@ int main(int argc, char *argv[]) // 4503 - decorated name length exceeded, name was truncated #pragma warning( disable : 4503) #endif + +#include "gpu_test_helper.h" diff --git a/libs/eigen/test/mapped_matrix.cpp b/libs/eigen/test/mapped_matrix.cpp index 0ea136a..1dd6959 100644 --- a/libs/eigen/test/mapped_matrix.cpp +++ b/libs/eigen/test/mapped_matrix.cpp @@ -7,10 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_NO_STATIC_ASSERT -#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them -#endif - #include "main.h" #define EIGEN_TESTMAP_MAX_SIZE 256 @@ -24,7 +20,9 @@ template void map_class_vector(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3; + // In case of no alignment, avoid division by zero. + constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1); + Scalar* array3unaligned = (internal::UIntPtr(array3)%alignment) == 0 ? array3+1 : array3; Scalar array4[EIGEN_TESTMAP_MAX_SIZE]; Map(array1, size) = VectorType::Random(size); @@ -64,7 +62,9 @@ template void map_class_matrix(const MatrixType& m) Scalar* array3 = new Scalar[size+1]; Index sizep1 = size + 1; // <- without this temporary MSVC 2103 generates bad code for(Index i = 0; i < sizep1; i++) array3[i] = Scalar(1); - Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3; + // In case of no alignment, avoid division by zero. + constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1); + Scalar* array3unaligned = (internal::UIntPtr(array3)%alignment) == 0 ? array3+1 : array3; Scalar array4[256]; if(size<=256) for(int i = 0; i < size; i++) array4[i] = Scalar(1); @@ -127,7 +127,9 @@ template void map_static_methods(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3; + // In case of no alignment, avoid division by zero. + constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1); + Scalar* array3unaligned = (internal::UIntPtr(array3)%alignment) == 0 ? array3+1 : array3; VectorType::MapAligned(array1, size) = VectorType::Random(size); VectorType::Map(array2, size) = VectorType::Map(array1, size); @@ -150,7 +152,7 @@ template void check_const_correctness(const PlainObjec // CMake can help with that. 
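The recurring alignment guard below exists because EIGEN_MAX_ALIGN_BYTES can legitimately be 0 (alignment disabled), and a modulo by zero is undefined behavior. Clamping the divisor to 1 keeps the check well defined and simply classifies every pointer as aligned in that configuration; a condensed sketch of the idiom:

    float buffer[4];
    const float* ptr = buffer;
    // (std::max) is parenthesized on purpose: main.h defines min/max macros to
    // catch unprotected calls, so a bare std::max would be macro-expanded.
    constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1);
    const bool isAligned = (Eigen::internal::UIntPtr(ptr) % alignment) == 0;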
// verify that map-to-const don't have LvalueBit - typedef typename internal::add_const::type ConstPlainObjectType; + typedef std::add_const_t ConstPlainObjectType; VERIFY( !(internal::traits >::Flags & LvalueBit) ); VERIFY( !(internal::traits >::Flags & LvalueBit) ); VERIFY( !(Map::Flags & LvalueBit) ); diff --git a/libs/eigen/test/mapstride.cpp b/libs/eigen/test/mapstride.cpp index fde73f2..42ceb0c 100644 --- a/libs/eigen/test/mapstride.cpp +++ b/libs/eigen/test/mapstride.cpp @@ -29,8 +29,8 @@ template void map_class_vector(const VectorTy map = v; for(int i = 0; i < size; ++i) { - VERIFY(array[3*i] == v[i]); - VERIFY(map[i] == v[i]); + VERIFY_IS_EQUAL(array[3*i], v[i]); + VERIFY_IS_EQUAL(map[i], v[i]); } } @@ -39,8 +39,8 @@ template void map_class_vector(const VectorTy map = v; for(int i = 0; i < size; ++i) { - VERIFY(array[2*i] == v[i]); - VERIFY(map[i] == v[i]); + VERIFY_IS_EQUAL(array[2*i], v[i]); + VERIFY_IS_EQUAL(map[i], v[i]); } } @@ -65,10 +65,13 @@ template void map_class_matrix(const MatrixTy Scalar a_array2[256]; Scalar* array2 = a_array2; - if(Alignment!=Aligned) + if(Alignment!=Aligned) { array2 = (Scalar*)(internal::IntPtr(a_array2) + (internal::packet_traits::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits::Real))); - else - array2 = (Scalar*)(((internal::UIntPtr(a_array2)+EIGEN_MAX_ALIGN_BYTES-1)/EIGEN_MAX_ALIGN_BYTES)*EIGEN_MAX_ALIGN_BYTES); + } else { + // In case there is no alignment, default to pointing to the start. + constexpr int alignment = (std::max)(EIGEN_MAX_ALIGN_BYTES, 1); + array2 = (Scalar*)(((internal::UIntPtr(a_array2)+alignment-1)/alignment)*alignment); + } Index maxsize2 = a_array2 - array2 + 256; // test no inner stride and some dynamic outer stride @@ -84,8 +87,8 @@ template void map_class_matrix(const MatrixTy for(int i = 0; i < m.outerSize(); ++i) for(int j = 0; j < m.innerSize(); ++j) { - VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j)); - VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(array[map.outerStride()*i+j], m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(map.coeffByOuterInner(i,j), m.coeffByOuterInner(i,j)); } VERIFY_IS_APPROX(s1*map,s1*m); map *= s1; @@ -111,8 +114,8 @@ template void map_class_matrix(const MatrixTy for(int i = 0; i < m.outerSize(); ++i) for(int j = 0; j < m.innerSize(); ++j) { - VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j)); - VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(array[map.outerStride()*i+j], m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(map.coeffByOuterInner(i,j), m.coeffByOuterInner(i,j)); } VERIFY_IS_APPROX(s1*map,s1*m); map *= s1; @@ -133,8 +136,8 @@ template void map_class_matrix(const MatrixTy for(int i = 0; i < m.outerSize(); ++i) for(int j = 0; j < m.innerSize(); ++j) { - VERIFY(array[map.outerStride()*i+map.innerStride()*j] == m.coeffByOuterInner(i,j)); - VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(array[map.outerStride()*i+map.innerStride()*j], m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(map.coeffByOuterInner(i,j), m.coeffByOuterInner(i,j)); } VERIFY_IS_APPROX(s1*map,s1*m); map *= s1; @@ -154,8 +157,8 @@ template void map_class_matrix(const MatrixTy for(int i = 0; i < m.outerSize(); ++i) for(int j = 0; j < m.innerSize(); ++j) { - VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j)); - VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + VERIFY_IS_EQUAL(array[map.innerSize()*i*2+j*2], m.coeffByOuterInner(i,j)); + 
VERIFY_IS_EQUAL(map.coeffByOuterInner(i,j), m.coeffByOuterInner(i,j)); } VERIFY_IS_APPROX(s1*map,s1*m); map *= s1; diff --git a/libs/eigen/test/meta.cpp b/libs/eigen/test/meta.cpp index 7a8b93c..cac7af1 100644 --- a/libs/eigen/test/meta.cpp +++ b/libs/eigen/test/meta.cpp @@ -29,47 +29,28 @@ struct MyImpl : public MyInterface { EIGEN_DECLARE_TEST(meta) { - VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value)); VERIFY(( internal::is_same::value)); VERIFY((!internal::is_same::value)); VERIFY((!internal::is_same::value)); VERIFY((!internal::is_same::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - - // test add_const - VERIFY(( internal::is_same< internal::add_const::type, const float >::value)); - VERIFY(( internal::is_same< internal::add_const::type, float* const>::value)); - VERIFY(( internal::is_same< internal::add_const::type, float const* const>::value)); - VERIFY(( internal::is_same< internal::add_const::type, float& >::value)); - - // test remove_const - VERIFY(( internal::is_same< internal::remove_const::type, float const* >::value)); - VERIFY(( internal::is_same< internal::remove_const::type, float const* >::value)); - VERIFY(( internal::is_same< internal::remove_const::type, float* >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); + VERIFY(( internal::is_same >::value)); // test add_const_on_value_type - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, float const& >::value)); - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, float const* >::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, float const& >::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, float const* >::value)); - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, const float >::value)); - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, const float >::value)); - - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, const float* const>::value)); - VERIFY(( internal::is_same< internal::add_const_on_value_type::type, const float* const>::value)); - - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); - VERIFY(( internal::is_same::type >::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, const float >::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, const float >::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, const float* const>::value)); + VERIFY(( internal::is_same< internal::add_const_on_value_type_t, const float* const>::value)); // is_convertible STATIC_CHECK(( internal::is_convertible::value )); @@ -114,13 +95,7 @@ EIGEN_DECLARE_TEST(meta) // So the following tests are expected to fail with recent compilers. 
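Context for the guard removed below: GCC before 4.8 produced a hard error for is_convertible involving an abstract class ("cannot allocate an object of abstract type"), i.e. it did not obey SFINAE, whereas a conforming trait simply reports false. A standalone sketch of the property now assumed unconditionally (the type is hypothetical):

    struct AbstractBase { virtual void f() = 0; };

    // Returning an abstract type by value is ill-formed, so the conversion is
    // reported as impossible; the trait itself still compiles cleanly.
    static_assert(!std::is_convertible<AbstractBase, AbstractBase>::value,
                  "abstract by-value target");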
STATIC_CHECK(( !internal::is_convertible::value )); - #if (!EIGEN_COMP_GNUC_STRICT) || (EIGEN_GNUC_AT_LEAST(4,8)) - // GCC prior to 4.8 fails to compile this test: - // error: cannot allocate an object of abstract type 'MyInterface' - // In other word, it does not obey SFINAE. - // Nevertheless, we don't really care about supporting abstract type as scalar type! STATIC_CHECK(( !internal::is_convertible::value )); - #endif STATIC_CHECK(( internal::is_convertible::value )); #endif diff --git a/libs/eigen/test/mixingtypes.cpp b/libs/eigen/test/mixingtypes.cpp index d450dbf..fe760b7 100644 --- a/libs/eigen/test/mixingtypes.cpp +++ b/libs/eigen/test/mixingtypes.cpp @@ -10,10 +10,6 @@ #if defined(EIGEN_TEST_PART_7) -#ifndef EIGEN_NO_STATIC_ASSERT -#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them -#endif - // ignore double-promotion diagnostic for clang and gcc, if we check for static assertion anyway: // TODO do the same for MSVC? #if defined(__clang__) @@ -49,28 +45,6 @@ using namespace std; VERIFY_IS_APPROX(XPR,REF); \ VERIFY( g_called && #XPR" not properly optimized"); -template -void raise_assertion(Index size = SizeAtCompileType) -{ - // VERIFY_RAISES_ASSERT(mf+md); // does not even compile - Matrix vf; vf.setRandom(size); - Matrix vd; vd.setRandom(size); - VERIFY_RAISES_ASSERT(vf=vd); - VERIFY_RAISES_ASSERT(vf+=vd); - VERIFY_RAISES_ASSERT(vf-=vd); - VERIFY_RAISES_ASSERT(vd=vf); - VERIFY_RAISES_ASSERT(vd+=vf); - VERIFY_RAISES_ASSERT(vd-=vf); - - // vd.asDiagonal() * mf; // does not even compile - // vcd.asDiagonal() * mf; // does not even compile - -#if 0 // we get other compilation errors here than just static asserts - VERIFY_RAISES_ASSERT(vd.dot(vf)); -#endif -} - - template void mixingtypes(int size = SizeAtCompileType) { typedef std::complex CF; @@ -139,11 +113,12 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_MIX_SCALAR(scd - vd.array() , scd - vd.template cast >().array()); // check scalar powers - VERIFY_MIX_SCALAR( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); - VERIFY_MIX_SCALAR( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); + // NOTE: scalar exponents use a unary op. 
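The switch from VERIFY_MIX_SCALAR to VERIFY_IS_APPROX on the following lines follows from that note: VERIFY_MIX_SCALAR asserts that a mixed-scalar binary functor actually ran (via the g_called flag), but a scalar exponent is folded into a unary power functor here, so the flag never fires even though the result is correct. Equivalence sketch (illustrative, outside the harness):

    Eigen::ArrayXcf a = Eigen::ArrayXcf::Random(4);
    const float s = 2.0f;

    // Same coefficients, different expression trees: the left-hand side binds
    // 's' inside a unary functor, the right-hand side is the complex-scalar form.
    const bool same = Eigen::pow(a, s).isApprox(Eigen::pow(a, std::complex<float>(s)));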
+ VERIFY_IS_APPROX( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); + VERIFY_IS_APPROX( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); VERIFY_MIX_SCALAR( pow(sd, vcd.array()), Eigen::pow(complex(sd), vcd.array()) ); - VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); - VERIFY_MIX_SCALAR( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast >().array()) ); // check dot product @@ -320,10 +295,5 @@ EIGEN_DECLARE_TEST(mixingtypes) CALL_SUBTEST_4(mixingtypes<3>()); CALL_SUBTEST_5(mixingtypes<4>()); CALL_SUBTEST_6(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); - CALL_SUBTEST_7(raise_assertion(internal::random(1,EIGEN_TEST_MAX_SIZE))); } - CALL_SUBTEST_7(raise_assertion<0>()); - CALL_SUBTEST_7(raise_assertion<3>()); - CALL_SUBTEST_7(raise_assertion<4>()); - CALL_SUBTEST_7(raise_assertion(0)); } diff --git a/libs/eigen/test/nestbyvalue.cpp b/libs/eigen/test/nestbyvalue.cpp index c5356bc..c25f0bf 100644 --- a/libs/eigen/test/nestbyvalue.cpp +++ b/libs/eigen/test/nestbyvalue.cpp @@ -26,12 +26,14 @@ EIGEN_DECLARE_TEST(nestbyvalue) for(int i = 0; i < g_repeat; i++) { Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); - MatrixXd a = MatrixXd(rows,cols); + MatrixXd a = MatrixXd::Random(rows,cols); nb_temporaries = 0; XprType x = get_xpr_with_temps(a); VERIFY_IS_EQUAL(nb_temporaries,6); MatrixXd b = x; VERIFY_IS_EQUAL(nb_temporaries,6+1); VERIFY_IS_APPROX(b, a.rowwise().reverse().eval() + (a+a).eval()); + // Block expressions work with dense NestByValue. 
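The added check exercises NestByValue in a dense expression: nestByValue() wraps the object so that the enclosing expression stores its own copy rather than a reference, the use case being expressions that must stay valid after the object they were built from goes away. A minimal sketch mirroring the test line (hypothetical usage):

    Eigen::MatrixXd a = Eigen::MatrixXd::Random(3, 3);

    // Each NestByValue wrapper holds a copy of 'a' inside the expression tree.
    Eigen::MatrixXd b = a.nestByValue().rowwise().reverse()
                      + (a.nestByValue() + a.nestByValue());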
+ VERIFY_IS_APPROX(b, a.nestByValue().rowwise().reverse().eval() + (a.nestByValue()+a.nestByValue()).eval()); } } diff --git a/libs/eigen/test/nesting_ops.cpp b/libs/eigen/test/nesting_ops.cpp index 4b5fc21..1350994 100644 --- a/libs/eigen/test/nesting_ops.cpp +++ b/libs/eigen/test/nesting_ops.cpp @@ -27,7 +27,7 @@ template bool verify_eval_type(const XprType &, const ReferenceType&) { typedef typename internal::nested_eval::type EvalType; - return internal::is_same::type, typename internal::remove_all::type>::value; + return internal::is_same, internal::remove_all_t>::value; } template void run_nesting_ops_1(const MatrixType& _m) diff --git a/libs/eigen/test/nomalloc.cpp b/libs/eigen/test/nomalloc.cpp index cb4c073..689a4cc 100644 --- a/libs/eigen/test/nomalloc.cpp +++ b/libs/eigen/test/nomalloc.cpp @@ -152,7 +152,7 @@ void ctms_decompositions() x = fpQR.solve(b); // SVD module - Eigen::JacobiSVD jSVD; jSVD.compute(A, ComputeFullU | ComputeFullV); + Eigen::JacobiSVD jSVD; jSVD.compute(A); } void test_zerosized() { diff --git a/libs/eigen/test/nullary.cpp b/libs/eigen/test/nullary.cpp index 9b25ea4..e524837 100644 --- a/libs/eigen/test/nullary.cpp +++ b/libs/eigen/test/nullary.cpp @@ -13,24 +13,20 @@ template bool equalsIdentity(const MatrixType& A) { - typedef typename MatrixType::Scalar Scalar; - Scalar zero = static_cast(0); - bool offDiagOK = true; for (Index i = 0; i < A.rows(); ++i) { for (Index j = i+1; j < A.cols(); ++j) { - offDiagOK = offDiagOK && (A(i,j) == zero); + offDiagOK = offDiagOK && numext::is_exactly_zero(A(i, j)); } } for (Index i = 0; i < A.rows(); ++i) { for (Index j = 0; j < (std::min)(i, A.cols()); ++j) { - offDiagOK = offDiagOK && (A(i,j) == zero); + offDiagOK = offDiagOK && numext::is_exactly_zero(A(i, j)); } } bool diagOK = (A.diagonal().array() == 1).all(); return offDiagOK && diagOK; - } template @@ -82,8 +78,9 @@ void testVectorType(const VectorType& base) const Scalar step = ((size == 1) ? 1 : (high-low)/RealScalar(size-1)); // check whether the result yields what we expect it to do - VectorType m(base); + VectorType m(base), o(base); m.setLinSpaced(size,low,high); + o.setEqualSpaced(size, low, step); if(!NumTraits::IsInteger) { @@ -91,6 +88,7 @@ void testVectorType(const VectorType& base) for (int i=0; i diff --git a/libs/eigen/test/num_dimensions.cpp b/libs/eigen/test/num_dimensions.cpp index 7ad7ef6..528c8f6 100644 --- a/libs/eigen/test/num_dimensions.cpp +++ b/libs/eigen/test/num_dimensions.cpp @@ -15,7 +15,6 @@ void check_dim(const Xpr& ) { STATIC_CHECK( Xpr::NumDimensions == ExpectedDim ); } -#if EIGEN_HAS_CXX11 template