ADD: new track message, Entity class and Position class
This commit is contained in:
20
libs/eigen/bench/tensors/README
Normal file
20
libs/eigen/bench/tensors/README
Normal file
@@ -0,0 +1,20 @@
|
||||
The tensor benchmark suite is made of several parts.
|
||||
|
||||
The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
|
||||
|
||||
To compile the floating point CPU benchmarks, simply call:
|
||||
g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
|
||||
|
||||
To compile the floating point GPU benchmarks, simply call:
|
||||
nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
|
||||
|
||||
We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
|
||||
nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
|
||||
|
||||
To compile and run the benchmark for SYCL, using ComputeCpp, simply run the
|
||||
following commands:
|
||||
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
|
||||
2. bash eigen_sycl_bench.sh
|
||||
|
||||
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
|
||||
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
|
||||
49
libs/eigen/bench/tensors/benchmark.h
Normal file
49
libs/eigen/bench/tensors/benchmark.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
namespace testing {
|
||||
class Benchmark {
|
||||
public:
|
||||
Benchmark(const char* name, void (*fn)(int)) {
|
||||
Register(name, fn, NULL);
|
||||
}
|
||||
Benchmark(const char* name, void (*fn_range)(int, int)) {
|
||||
Register(name, NULL, fn_range);
|
||||
}
|
||||
Benchmark* Arg(int x);
|
||||
Benchmark* Range(int lo, int hi);
|
||||
const char* Name();
|
||||
bool ShouldRun(int argc, char* argv[]);
|
||||
void Run();
|
||||
private:
|
||||
const char* name_;
|
||||
void (*fn_)(int);
|
||||
void (*fn_range_)(int, int);
|
||||
std::vector<int> args_;
|
||||
void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
|
||||
void RunRepeatedlyWithArg(int iterations, int arg);
|
||||
void RunWithArg(int arg);
|
||||
};
|
||||
} // namespace testing
|
||||
void SetBenchmarkFlopsProcessed(int64_t);
|
||||
void StopBenchmarkTiming();
|
||||
void StartBenchmarkTiming();
|
||||
#define BENCHMARK(f) \
|
||||
static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
|
||||
(new ::testing::Benchmark(#f, f))
|
||||
237
libs/eigen/bench/tensors/benchmark_main.cc
Normal file
237
libs/eigen/bench/tensors/benchmark_main.cc
Normal file
@@ -0,0 +1,237 @@
|
||||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "benchmark.h"
|
||||
#include <regex.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <inttypes.h>
|
||||
#include <time.h>
|
||||
#include <map>
|
||||
|
||||
static int64_t g_flops_processed;
|
||||
static int64_t g_benchmark_total_time_ns;
|
||||
static int64_t g_benchmark_start_time_ns;
|
||||
typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
|
||||
typedef BenchmarkMap::iterator BenchmarkMapIt;
|
||||
|
||||
BenchmarkMap& gBenchmarks() {
|
||||
static BenchmarkMap g_benchmarks;
|
||||
return g_benchmarks;
|
||||
}
|
||||
|
||||
static int g_name_column_width = 20;
|
||||
|
||||
static int Round(int n) {
|
||||
int base = 1;
|
||||
while (base*10 < n) {
|
||||
base *= 10;
|
||||
}
|
||||
if (n < 2*base) {
|
||||
return 2*base;
|
||||
}
|
||||
if (n < 5*base) {
|
||||
return 5*base;
|
||||
}
|
||||
return 10*base;
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <mach/mach_time.h>
|
||||
static mach_timebase_info_data_t g_time_info;
|
||||
static void __attribute__((constructor)) init_info() {
|
||||
mach_timebase_info(&g_time_info);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int64_t NanoTime() {
|
||||
#if defined(__APPLE__)
|
||||
uint64_t t = mach_absolute_time();
|
||||
return t * g_time_info.numer / g_time_info.denom;
|
||||
#else
|
||||
struct timespec t;
|
||||
t.tv_sec = t.tv_nsec = 0;
|
||||
clock_gettime(CLOCK_MONOTONIC, &t);
|
||||
return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace testing {
|
||||
Benchmark* Benchmark::Arg(int arg) {
|
||||
args_.push_back(arg);
|
||||
return this;
|
||||
}
|
||||
|
||||
Benchmark* Benchmark::Range(int lo, int hi) {
|
||||
const int kRangeMultiplier = 8;
|
||||
if (hi < lo) {
|
||||
int temp = hi;
|
||||
hi = lo;
|
||||
lo = temp;
|
||||
}
|
||||
while (lo < hi) {
|
||||
args_.push_back(lo);
|
||||
lo *= kRangeMultiplier;
|
||||
}
|
||||
// We always run the hi number.
|
||||
args_.push_back(hi);
|
||||
return this;
|
||||
}
|
||||
|
||||
const char* Benchmark::Name() {
|
||||
return name_;
|
||||
}
|
||||
bool Benchmark::ShouldRun(int argc, char* argv[]) {
|
||||
if (argc == 1) {
|
||||
return true; // With no arguments, we run all benchmarks.
|
||||
}
|
||||
// Otherwise, we interpret each argument as a regular expression and
|
||||
// see if any of our benchmarks match.
|
||||
for (int i = 1; i < argc; i++) {
|
||||
regex_t re;
|
||||
if (regcomp(&re, argv[i], 0) != 0) {
|
||||
fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
int match = regexec(&re, name_, 0, NULL, 0);
|
||||
regfree(&re);
|
||||
if (match != REG_NOMATCH) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
|
||||
name_ = name;
|
||||
fn_ = fn;
|
||||
fn_range_ = fn_range;
|
||||
if (fn_ == NULL && fn_range_ == NULL) {
|
||||
fprintf(stderr, "%s: missing function\n", name_);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
gBenchmarks().insert(std::make_pair(name, this));
|
||||
}
|
||||
void Benchmark::Run() {
|
||||
if (fn_ != NULL) {
|
||||
RunWithArg(0);
|
||||
} else {
|
||||
if (args_.empty()) {
|
||||
fprintf(stderr, "%s: no args!\n", name_);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for (size_t i = 0; i < args_.size(); ++i) {
|
||||
RunWithArg(args_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
|
||||
g_flops_processed = 0;
|
||||
g_benchmark_total_time_ns = 0;
|
||||
g_benchmark_start_time_ns = NanoTime();
|
||||
if (fn_ != NULL) {
|
||||
fn_(iterations);
|
||||
} else {
|
||||
fn_range_(iterations, arg);
|
||||
}
|
||||
if (g_benchmark_start_time_ns != 0) {
|
||||
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||
}
|
||||
}
|
||||
void Benchmark::RunWithArg(int arg) {
|
||||
// run once in case it's expensive
|
||||
int iterations = 1;
|
||||
RunRepeatedlyWithArg(iterations, arg);
|
||||
while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
|
||||
int last = iterations;
|
||||
if (g_benchmark_total_time_ns/iterations == 0) {
|
||||
iterations = 1e9;
|
||||
} else {
|
||||
iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
|
||||
}
|
||||
iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
|
||||
iterations = Round(iterations);
|
||||
RunRepeatedlyWithArg(iterations, arg);
|
||||
}
|
||||
char throughput[100];
|
||||
throughput[0] = '\0';
|
||||
if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
|
||||
double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
|
||||
double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
|
||||
snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
|
||||
}
|
||||
char full_name[100];
|
||||
if (fn_range_ != NULL) {
|
||||
if (arg >= (1<<20)) {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
|
||||
} else if (arg >= (1<<10)) {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
|
||||
} else {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
|
||||
}
|
||||
} else {
|
||||
snprintf(full_name, sizeof(full_name), "%s", name_);
|
||||
}
|
||||
printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
|
||||
iterations, g_benchmark_total_time_ns/iterations, throughput);
|
||||
fflush(stdout);
|
||||
}
|
||||
} // namespace testing
|
||||
void SetBenchmarkFlopsProcessed(int64_t x) {
|
||||
g_flops_processed = x;
|
||||
}
|
||||
void StopBenchmarkTiming() {
|
||||
if (g_benchmark_start_time_ns != 0) {
|
||||
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||
}
|
||||
g_benchmark_start_time_ns = 0;
|
||||
}
|
||||
void StartBenchmarkTiming() {
|
||||
if (g_benchmark_start_time_ns == 0) {
|
||||
g_benchmark_start_time_ns = NanoTime();
|
||||
}
|
||||
}
|
||||
int main(int argc, char* argv[]) {
|
||||
if (gBenchmarks().empty()) {
|
||||
fprintf(stderr, "No benchmarks registered!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
int name_width = static_cast<int>(strlen(it->second->Name()));
|
||||
g_name_column_width = std::max(g_name_column_width, name_width);
|
||||
}
|
||||
bool need_header = true;
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
::testing::Benchmark* b = it->second;
|
||||
if (b->ShouldRun(argc, argv)) {
|
||||
if (need_header) {
|
||||
printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
|
||||
fflush(stdout);
|
||||
need_header = false;
|
||||
}
|
||||
b->Run();
|
||||
}
|
||||
}
|
||||
if (need_header) {
|
||||
fprintf(stderr, "No matching benchmarks!\n");
|
||||
fprintf(stderr, "Available benchmarks:\n");
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
fprintf(stderr, " %s\n", it->second->Name());
|
||||
}
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
39
libs/eigen/bench/tensors/contraction_benchmarks_cpu.cc
Normal file
39
libs/eigen/bench/tensors/contraction_benchmarks_cpu.cc
Normal file
@@ -0,0 +1,39 @@
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
#define CREATE_THREAD_POOL(threads) \
|
||||
Eigen::ThreadPool pool(threads); \
|
||||
Eigen::ThreadPoolDevice device(&pool, threads);
|
||||
|
||||
|
||||
// Contractions for number of threads ranging from 1 to 32
|
||||
// Dimensions are Rows, Cols, Depth
|
||||
#define BM_ContractionCPU(D1, D2, D3) \
|
||||
static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
|
||||
StopBenchmarkTiming(); \
|
||||
CREATE_THREAD_POOL(Threads); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
|
||||
suite.contraction(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
|
||||
|
||||
|
||||
// Vector Matrix and Matrix Vector products
|
||||
BM_ContractionCPU(1, 2000, 500);
|
||||
BM_ContractionCPU(2000, 1, 500);
|
||||
|
||||
// Various skinny matrices
|
||||
BM_ContractionCPU(250, 3, 512);
|
||||
BM_ContractionCPU(1500, 3, 512);
|
||||
|
||||
BM_ContractionCPU(512, 800, 4);
|
||||
BM_ContractionCPU(512, 80, 800);
|
||||
BM_ContractionCPU(512, 80, 13522);
|
||||
BM_ContractionCPU(1, 80, 13522);
|
||||
|
||||
BM_ContractionCPU(3200, 512, 4);
|
||||
BM_ContractionCPU(3200, 512, 80);
|
||||
BM_ContractionCPU(3200, 80, 512);
|
||||
30
libs/eigen/bench/tensors/eigen_sycl_bench.sh
Executable file
30
libs/eigen/bench/tensors/eigen_sycl_bench.sh
Executable file
@@ -0,0 +1,30 @@
|
||||
rm -f tensor_benchmark_sycl
|
||||
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
|
||||
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
|
||||
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
|
||||
tensor_benchmarks_sycl.cc \
|
||||
benchmark_main.cc \
|
||||
-I ../../ \
|
||||
-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
|
||||
-std=c++11 \
|
||||
-march=native \
|
||||
-O3 \
|
||||
-DNDEBUG \
|
||||
-DEIGEN_MPL2_ONLY \
|
||||
-DEIGEN_USE_SYCL=1 \
|
||||
-DEIGEN_SYCL_LOCAL_MEM=1 \
|
||||
-no-serial-memop \
|
||||
-mllvm \
|
||||
-inline-threshold=10000 \
|
||||
-fsycl-ih-last \
|
||||
-sycl-driver \
|
||||
-Xclang -cl-mad-enable \
|
||||
-lOpenCL \
|
||||
-lComputeCpp \
|
||||
-lpthread \
|
||||
-o \
|
||||
tensor_benchmark_sycl\
|
||||
${@:1}
|
||||
|
||||
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
|
||||
./tensor_benchmark_sycl
|
||||
7
libs/eigen/bench/tensors/eigen_sycl_bench_contract.sh
Normal file
7
libs/eigen/bench/tensors/eigen_sycl_bench_contract.sh
Normal file
@@ -0,0 +1,7 @@
|
||||
rm -f tensor_contract_sycl_bench
|
||||
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
|
||||
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
|
||||
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1}
|
||||
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
|
||||
./tensor_contract_sycl_bench
|
||||
|
||||
597
libs/eigen/bench/tensors/tensor_benchmarks.h
Normal file
597
libs/eigen/bench/tensors/tensor_benchmarks.h
Normal file
@@ -0,0 +1,597 @@
|
||||
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
||||
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
||||
|
||||
typedef int TensorIndex;
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "benchmark.h"
|
||||
|
||||
#define BENCHMARK_RANGE(bench, lo, hi) \
|
||||
BENCHMARK(bench)->Range(lo, hi)
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
|
||||
// TODO(bsteiner): also templatize on the input type since we have users
|
||||
// for int8 as well as floats.
|
||||
template <typename Device, typename T> class BenchmarkSuite {
|
||||
public:
|
||||
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
|
||||
: m_(m), k_(k), n_(n), device_(device) {
|
||||
initialize();
|
||||
}
|
||||
|
||||
BenchmarkSuite(const Device& device, size_t m)
|
||||
: m_(m), k_(m), n_(m), device_(device) {
|
||||
initialize();
|
||||
}
|
||||
|
||||
BenchmarkSuite(const Device& device, size_t m, size_t k)
|
||||
: m_(1), k_(k), n_(m), device_(device) {
|
||||
initialize();
|
||||
}
|
||||
|
||||
~BenchmarkSuite() {
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
}
|
||||
|
||||
void memcpy(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void typeCasting(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
if (sizeof(T) >= sizeof(int)) {
|
||||
sizes[0] = m_;
|
||||
sizes[1] = k_;
|
||||
} else {
|
||||
sizes[0] = m_ * sizeof(T) / sizeof(int);
|
||||
sizes[1] = k_ * sizeof(T) / sizeof(int);
|
||||
}
|
||||
const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
|
||||
TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
B.device(device_) = A.template cast<T>();
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.template cast<T>();
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void random(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
sizes[0] = m_;
|
||||
sizes[1] = m_;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = C.random();
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = C.random();
|
||||
}
|
||||
// Record the number of random numbers generated per second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void slicing(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
sizes[0] = m_;
|
||||
sizes[1] = m_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
|
||||
const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
|
||||
const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
|
||||
const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
|
||||
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.slice(first_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(first_quadrant, quarter_sizes);
|
||||
C.slice(second_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(second_quadrant, quarter_sizes);
|
||||
C.slice(third_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(third_quadrant, quarter_sizes);
|
||||
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(fourth_quadrant, quarter_sizes);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.slice(first_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(first_quadrant, quarter_sizes);
|
||||
C.slice(second_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(second_quadrant, quarter_sizes);
|
||||
C.slice(third_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(third_quadrant, quarter_sizes);
|
||||
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(fourth_quadrant, quarter_sizes);
|
||||
}
|
||||
// Record the number of values copied from the rhs slice to the lhs slice
|
||||
// each second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void rowChip(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> input_size;
|
||||
input_size[0] = k_;
|
||||
input_size[1] = n_;
|
||||
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
Eigen::array<TensorIndex, 1> output_size;
|
||||
output_size[0] = n_;
|
||||
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = B.chip(iter % k_, 0);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % k_, 0);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
|
||||
}
|
||||
|
||||
void colChip(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> input_size;
|
||||
input_size[0] = k_;
|
||||
input_size[1] = n_;
|
||||
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
Eigen::array<TensorIndex, 1> output_size;
|
||||
output_size[0] = n_;
|
||||
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = B.chip(iter % n_, 1);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % n_, 1);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
|
||||
}
|
||||
|
||||
void shuffling(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
Eigen::array<TensorIndex, 2> size_a;
|
||||
size_a[0] = m_;
|
||||
size_a[1] = k_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
Eigen::array<TensorIndex, 2> size_b;
|
||||
size_b[0] = k_;
|
||||
size_b[1] = m_;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
Eigen::array<int, 2> shuffle;
|
||||
shuffle[0] = 1;
|
||||
shuffle[1] = 0;
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
B.device(device_) = A.shuffle(shuffle);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.shuffle(shuffle);
|
||||
}
|
||||
// Record the number of values shuffled from A and copied to B each second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void padding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
Eigen::array<TensorIndex, 2> size_a;
|
||||
size_a[0] = m_;
|
||||
size_a[1] = k_-3;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
Eigen::array<TensorIndex, 2> size_b;
|
||||
size_b[0] = k_;
|
||||
size_b[1] = m_;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
#if defined(EIGEN_HAS_INDEX_LIST)
|
||||
Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
|
||||
Eigen::type2indexpair<2, 1> > paddings;
|
||||
#else
|
||||
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
|
||||
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
|
||||
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
|
||||
#endif
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
B.device(device_) = A.pad(paddings);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.pad(paddings);
|
||||
}
|
||||
// Record the number of values copied from the padded tensor A each second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void striding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
Eigen::array<TensorIndex, 2> size_a;
|
||||
size_a[0] = m_;
|
||||
size_a[1] = k_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
Eigen::array<TensorIndex, 2> size_b;
|
||||
size_b[0] = m_;
|
||||
size_b[1] = k_/2;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
Eigen::array<TensorIndex, 2> strides;
|
||||
strides[0] = 1;
|
||||
strides[1] = 2;
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
B.device(device_) = A.stride(strides);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.stride(strides);
|
||||
}
|
||||
// Record the number of values copied from the padded tensor A each second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
|
||||
void broadcasting(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> size_a;
|
||||
size_a[0] = m_;
|
||||
size_a[1] = 1;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
Eigen::array<TensorIndex, 2> size_c;
|
||||
size_c[0] = m_;
|
||||
size_c[1] = n_;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
Eigen::array<int, 2> broadcast;
|
||||
broadcast[0] = 1;
|
||||
broadcast[1] = n_;
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
|
||||
broadcast.set(1, n_);
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.broadcast(broadcast);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.broadcast(broadcast);
|
||||
}
|
||||
// Record the number of values broadcasted from A and copied to C each second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
void coeffWiseOp(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
sizes[0] = m_;
|
||||
sizes[1] = m_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
|
||||
}
|
||||
// Record the number of FLOP executed per second (2 multiplications and
|
||||
// 1 addition per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void algebraicFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
sizes[0] = m_;
|
||||
sizes[1] = m_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void transcendentalFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
Eigen::array<TensorIndex, 2> sizes;
|
||||
sizes[0] = m_;
|
||||
sizes[1] = m_;
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.exp() + B.log();
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.exp() + B.log();
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
// Row reduction
|
||||
void rowReduction(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> input_size;
|
||||
input_size[0] = k_;
|
||||
input_size[1] = n_;
|
||||
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
Eigen::array<TensorIndex, 1> output_size;
|
||||
output_size[0] = n_;
|
||||
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
Eigen::array<TensorIndex, 1> sum_along_dim;
|
||||
sum_along_dim[0] = 0;
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
|
||||
#endif
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
// Column reduction
|
||||
void colReduction(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> input_size;
|
||||
input_size[0] = k_;
|
||||
input_size[1] = n_;
|
||||
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
|
||||
b_, input_size);
|
||||
Eigen::array<TensorIndex, 1> output_size;
|
||||
output_size[0] = k_;
|
||||
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
|
||||
a_, output_size);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
Eigen::array<TensorIndex, 1> sum_along_dim;
|
||||
sum_along_dim[0] = 1;
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
|
||||
#endif
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
A.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
A.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
// Full reduction
|
||||
void fullReduction(int num_iters) {
|
||||
Eigen::array<TensorIndex, 2> input_size;
|
||||
input_size[0] = k_;
|
||||
input_size[1] = n_;
|
||||
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
|
||||
b_, input_size);
|
||||
Eigen::array<TensorIndex, 0> output_size;
|
||||
TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
|
||||
c_, output_size);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = B.sum();
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.sum();
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
void contraction(int num_iters) {
|
||||
contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
|
||||
}
|
||||
|
||||
void contractionRowMajor(int num_iters) {
|
||||
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
|
||||
}
|
||||
|
||||
void contractionRowMajorAT(int num_iters) {
|
||||
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
|
||||
}
|
||||
|
||||
void contractionRowMajorBT(int num_iters) {
|
||||
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
|
||||
}
|
||||
|
||||
void contractionRowMajorABT(int num_iters) {
|
||||
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
|
||||
}
|
||||
|
||||
void convolution(int num_iters, int kernel_x, int kernel_y) {
|
||||
Eigen::array<TensorIndex, 2> input_sizes;
|
||||
input_sizes[0] = m_;
|
||||
input_sizes[1] = n_;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
|
||||
Eigen::array<TensorIndex, 2> kernel_sizes;
|
||||
kernel_sizes[0] = kernel_x;
|
||||
kernel_sizes[1] = kernel_y;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
|
||||
Eigen::array<TensorIndex, 2> result_sizes;
|
||||
result_sizes[0] = m_ - kernel_x + 1;
|
||||
result_sizes[1] = n_ - kernel_y + 1;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
|
||||
Eigen::array<TensorIndex, 2> dims;
|
||||
dims[0] = 0;
|
||||
dims[1] = 1;
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.convolve(B, dims);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.convolve(B, dims);
|
||||
}
|
||||
// Record the number of FLOPs executed per second (kernel_size
|
||||
// multiplications and additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(static_cast<int64_t>(2) *
|
||||
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
|
||||
}
|
||||
|
||||
private:
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
template<int Layout>
|
||||
void contraction(int num_iters, bool trans_a, bool trans_b) {
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = (trans_a ? k_: m_);
|
||||
sizeA[1] = (trans_a ? m_: k_);
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = (trans_b ? n_: k_);
|
||||
sizeB[1] = (trans_b ? k_: n_);
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
TensorIndex a_contract_dim = (trans_a ? 0 : 1);
|
||||
TensorIndex b_contract_dim = (trans_b ? 1 : 0);
|
||||
dims[0] = DimPair(a_contract_dim, b_contract_dim);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
// Record the number of FLOP executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
|
||||
}
|
||||
|
||||
inline void finalizeBenchmark(int64_t num_items) {
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
|
||||
device_.synchronize();
|
||||
}
|
||||
#elif defined(EIGEN_USE_SYCL)
|
||||
if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
|
||||
device_.synchronize();
|
||||
}
|
||||
|
||||
#endif
|
||||
StopBenchmarkTiming();
|
||||
SetBenchmarkFlopsProcessed(num_items);
|
||||
}
|
||||
|
||||
|
||||
TensorIndex m_;
|
||||
TensorIndex k_;
|
||||
TensorIndex n_;
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
Device device_;
|
||||
};
|
||||
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
||||
168
libs/eigen/bench/tensors/tensor_benchmarks_cpu.cc
Normal file
168
libs/eigen/bench/tensors/tensor_benchmarks_cpu.cc
Normal file
@@ -0,0 +1,168 @@
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
#define CREATE_THREAD_POOL(threads) \
|
||||
Eigen::ThreadPool pool(threads); \
|
||||
Eigen::ThreadPoolDevice device(&pool, threads);
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncCPU(FUNC, THREADS) \
|
||||
static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
|
||||
|
||||
BM_FuncCPU(memcpy, 4);
|
||||
BM_FuncCPU(memcpy, 8);
|
||||
BM_FuncCPU(memcpy, 12);
|
||||
|
||||
BM_FuncCPU(typeCasting, 4);
|
||||
BM_FuncCPU(typeCasting, 8);
|
||||
BM_FuncCPU(typeCasting, 12);
|
||||
|
||||
BM_FuncCPU(random, 4);
|
||||
BM_FuncCPU(random, 8);
|
||||
BM_FuncCPU(random, 12);
|
||||
|
||||
BM_FuncCPU(slicing, 4);
|
||||
BM_FuncCPU(slicing, 8);
|
||||
BM_FuncCPU(slicing, 12);
|
||||
|
||||
BM_FuncCPU(rowChip, 4);
|
||||
BM_FuncCPU(rowChip, 8);
|
||||
BM_FuncCPU(rowChip, 12);
|
||||
|
||||
BM_FuncCPU(colChip, 4);
|
||||
BM_FuncCPU(colChip, 8);
|
||||
BM_FuncCPU(colChip, 12);
|
||||
|
||||
BM_FuncCPU(shuffling, 4);
|
||||
BM_FuncCPU(shuffling, 8);
|
||||
BM_FuncCPU(shuffling, 12);
|
||||
|
||||
BM_FuncCPU(padding, 4);
|
||||
BM_FuncCPU(padding, 8);
|
||||
BM_FuncCPU(padding, 12);
|
||||
|
||||
BM_FuncCPU(striding, 4);
|
||||
BM_FuncCPU(striding, 8);
|
||||
BM_FuncCPU(striding, 12);
|
||||
|
||||
BM_FuncCPU(broadcasting, 4);
|
||||
BM_FuncCPU(broadcasting, 8);
|
||||
BM_FuncCPU(broadcasting, 12);
|
||||
|
||||
BM_FuncCPU(coeffWiseOp, 4);
|
||||
BM_FuncCPU(coeffWiseOp, 8);
|
||||
BM_FuncCPU(coeffWiseOp, 12);
|
||||
|
||||
BM_FuncCPU(algebraicFunc, 4);
|
||||
BM_FuncCPU(algebraicFunc, 8);
|
||||
BM_FuncCPU(algebraicFunc, 12);
|
||||
|
||||
BM_FuncCPU(transcendentalFunc, 4);
|
||||
BM_FuncCPU(transcendentalFunc, 8);
|
||||
BM_FuncCPU(transcendentalFunc, 12);
|
||||
|
||||
BM_FuncCPU(rowReduction, 4);
|
||||
BM_FuncCPU(rowReduction, 8);
|
||||
BM_FuncCPU(rowReduction, 12);
|
||||
|
||||
BM_FuncCPU(colReduction, 4);
|
||||
BM_FuncCPU(colReduction, 8);
|
||||
BM_FuncCPU(colReduction, 12);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
if (THREADS == 1) { \
|
||||
Eigen::DefaultDevice device; \
|
||||
BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} else { \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
|
||||
77
libs/eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu
Normal file
77
libs/eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu
Normal file
@@ -0,0 +1,77 @@
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncGPU(FUNC) \
|
||||
static void BM_##FUNC(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(typeCasting);
|
||||
//BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(rowChip);
|
||||
BM_FuncGPU(colChip);
|
||||
BM_FuncGPU(shuffling);
|
||||
BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
BM_FuncGPU(fullReduction);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||
|
||||
/*
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
|
||||
*/
|
||||
75
libs/eigen/bench/tensors/tensor_benchmarks_gpu.cu
Normal file
75
libs/eigen/bench/tensors/tensor_benchmarks_gpu.cu
Normal file
@@ -0,0 +1,75 @@
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncGPU(FUNC) \
|
||||
static void BM_##FUNC(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(typeCasting);
|
||||
BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(rowChip);
|
||||
BM_FuncGPU(colChip);
|
||||
BM_FuncGPU(shuffling);
|
||||
BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
BM_FuncGPU(fullReduction);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
|
||||
140
libs/eigen/bench/tensors/tensor_benchmarks_sycl.cc
Normal file
140
libs/eigen/bench/tensors/tensor_benchmarks_sycl.cc
Normal file
@@ -0,0 +1,140 @@
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
#include <iostream>
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
cl::sycl::gpu_selector selector;
|
||||
Eigen::QueueInterface queue(selector);
|
||||
#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \
|
||||
static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::SyclDevice device(&queue); \
|
||||
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);
|
||||
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
|
||||
BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
|
||||
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
|
||||
BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
|
||||
BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
|
||||
#define BM_FuncGPU(FUNC) \
|
||||
static void BM_##FUNC(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::SyclDevice device(&queue); \
|
||||
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
BM_FuncGPU(fullReduction);
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(typeCasting);
|
||||
BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(rowChip);
|
||||
BM_FuncGPU(colChip);
|
||||
BM_FuncGPU(shuffling);
|
||||
BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::SyclDevice device(&queue); \
|
||||
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
|
||||
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
|
||||
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
Eigen::SyclDevice device(&queue); \
|
||||
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
|
||||
#endif
|
||||
325
libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc
Normal file
325
libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc
Normal file
@@ -0,0 +1,325 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#ifndef EIGEN_BENCH_CONTRACT_SYCL
|
||||
#define EIGEN_BENCH_CONTRACT_SYCL
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
||||
#include <SYCL/sycl.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::array;
|
||||
using Eigen::SyclDevice;
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
std::ofstream out("Result.txt");
|
||||
|
||||
std::chrono::time_point<std::chrono::system_clock> get_time(){
|
||||
std::chrono::time_point<std::chrono::system_clock> start, end;
|
||||
return std::chrono::system_clock::now();
|
||||
}
|
||||
|
||||
template<typename Start, typename End, typename TensorIndex>
|
||||
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
|
||||
|
||||
std::chrono::duration<double> elapsed_seconds = end-start;
|
||||
std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
|
||||
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
|
||||
out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
|
||||
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
|
||||
}
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
template<typename T, typename Device, typename TensorIndex>
|
||||
void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = m_;
|
||||
sizeA[1] = k_;
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = k_;
|
||||
sizeB[1] = n_;
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
dims[0] = DimPair(1, 0);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
auto start = get_time();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
auto end = get_time();
|
||||
// Record the number of FLOPs executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
device_.synchronize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
template<typename T, typename Device, typename TensorIndex>
|
||||
void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = m_;
|
||||
sizeA[1] = k_;
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = k_;
|
||||
sizeB[1] = n_;
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
dims[0] = DimPair(1, 0);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
auto start = get_time();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
auto end = get_time();
|
||||
// Record the number of FLOPs executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
device_.synchronize();
|
||||
}
|
||||
|
||||
|
||||
template<typename T, typename Device, typename TensorIndex>
|
||||
void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = k_;
|
||||
sizeA[1] = m_;
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = k_;
|
||||
sizeB[1] = n_;
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
dims[0] = DimPair(0, 0);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
auto start = get_time();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
auto end = get_time();
|
||||
// Record the number of FLOPs executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
device_.synchronize();
|
||||
|
||||
}
|
||||
|
||||
template<typename T, typename Device, typename TensorIndex>
|
||||
void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = m_;
|
||||
sizeA[1] = k_;
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = n_;
|
||||
sizeB[1] = k_;
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
dims[0] = DimPair(1, 1);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
auto start = get_time();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
auto end = get_time();
|
||||
// Record the number of FLOPs executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
device_.synchronize();
|
||||
|
||||
}
|
||||
|
||||
template<typename T, typename Device, typename TensorIndex>
|
||||
void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
|
||||
T* a_;
|
||||
T* b_;
|
||||
T* c_;
|
||||
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
|
||||
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
|
||||
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
||||
|
||||
Eigen::array<TensorIndex, 2> sizeA;
|
||||
sizeA[0] = k_;
|
||||
sizeA[1] = m_;
|
||||
Eigen::array<TensorIndex, 2> sizeB;
|
||||
sizeB[0] = n_;
|
||||
sizeB[1] = k_;
|
||||
Eigen::array<TensorIndex, 2> sizeC;
|
||||
sizeC[0] = m_;
|
||||
sizeC[1] = n_;
|
||||
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<T, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims;
|
||||
dims[0] = DimPair(0, 1);
|
||||
#ifdef EIGEN_USE_SYCL // warmup for sycl
|
||||
for (int iter = 0; iter < 10; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
#endif
|
||||
auto start = get_time();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
auto end = get_time();
|
||||
// Record the number of FLOPs executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
device_.synchronize();
|
||||
}
|
||||
|
||||
int main() {
|
||||
cl::sycl::gpu_selector selector;
|
||||
Eigen::QueueInterface queue(selector);
|
||||
Eigen::SyclDevice device(&queue);
|
||||
int64_t num_iters =20;
|
||||
for(int64_t m = 32; m <= 4096; m *= 2)
|
||||
for(int64_t k = 32; k <= 4096; k *= 2)
|
||||
for(int64_t n = 32; n <= 4096; n*= 2){
|
||||
(contraction<float>(device, num_iters, m, k, n));
|
||||
(contractionRowMajor<float>(device, num_iters, m, k, n));
|
||||
(contractionAT<float>(device, num_iters, m, k, n));
|
||||
(contractionBT<float>(device, num_iters, m, k, n));
|
||||
(contractionABT<float>(device, num_iters, m, k, n));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // EIGEN_BENCH_CONTRACT_SYCL
|
||||
Reference in New Issue
Block a user