ADD: new track message, Entity class and Position class

This commit is contained in:
Henry Winkel
2022-12-20 17:20:35 +01:00
parent 469ecfb099
commit 98ebb563a8
2114 changed files with 482360 additions and 24 deletions

View File

@@ -0,0 +1,20 @@
The tensor benchmark suite is made of several parts.
The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
To compile the floating point CPU benchmarks, simply call:
g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
To compile the floating point GPU benchmarks, simply call:
nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
To compile and run the benchmark for SYCL, using ComputeCpp, simply run the
following commands:
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu

View File

@@ -0,0 +1,49 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stddef.h>
#include <stdint.h>
#include <vector>
namespace testing {
class Benchmark {
public:
Benchmark(const char* name, void (*fn)(int)) {
Register(name, fn, NULL);
}
Benchmark(const char* name, void (*fn_range)(int, int)) {
Register(name, NULL, fn_range);
}
Benchmark* Arg(int x);
Benchmark* Range(int lo, int hi);
const char* Name();
bool ShouldRun(int argc, char* argv[]);
void Run();
private:
const char* name_;
void (*fn_)(int);
void (*fn_range_)(int, int);
std::vector<int> args_;
void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
void RunRepeatedlyWithArg(int iterations, int arg);
void RunWithArg(int arg);
};
} // namespace testing
void SetBenchmarkFlopsProcessed(int64_t);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
#define BENCHMARK(f) \
static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
(new ::testing::Benchmark(#f, f))

View File

@@ -0,0 +1,237 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark.h"
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <inttypes.h>
#include <time.h>
#include <map>
static int64_t g_flops_processed;
static int64_t g_benchmark_total_time_ns;
static int64_t g_benchmark_start_time_ns;
typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
typedef BenchmarkMap::iterator BenchmarkMapIt;
BenchmarkMap& gBenchmarks() {
static BenchmarkMap g_benchmarks;
return g_benchmarks;
}
static int g_name_column_width = 20;
static int Round(int n) {
int base = 1;
while (base*10 < n) {
base *= 10;
}
if (n < 2*base) {
return 2*base;
}
if (n < 5*base) {
return 5*base;
}
return 10*base;
}
#ifdef __APPLE__
#include <mach/mach_time.h>
static mach_timebase_info_data_t g_time_info;
static void __attribute__((constructor)) init_info() {
mach_timebase_info(&g_time_info);
}
#endif
static int64_t NanoTime() {
#if defined(__APPLE__)
uint64_t t = mach_absolute_time();
return t * g_time_info.numer / g_time_info.denom;
#else
struct timespec t;
t.tv_sec = t.tv_nsec = 0;
clock_gettime(CLOCK_MONOTONIC, &t);
return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
#endif
}
namespace testing {
Benchmark* Benchmark::Arg(int arg) {
args_.push_back(arg);
return this;
}
Benchmark* Benchmark::Range(int lo, int hi) {
const int kRangeMultiplier = 8;
if (hi < lo) {
int temp = hi;
hi = lo;
lo = temp;
}
while (lo < hi) {
args_.push_back(lo);
lo *= kRangeMultiplier;
}
// We always run the hi number.
args_.push_back(hi);
return this;
}
const char* Benchmark::Name() {
return name_;
}
bool Benchmark::ShouldRun(int argc, char* argv[]) {
if (argc == 1) {
return true; // With no arguments, we run all benchmarks.
}
// Otherwise, we interpret each argument as a regular expression and
// see if any of our benchmarks match.
for (int i = 1; i < argc; i++) {
regex_t re;
if (regcomp(&re, argv[i], 0) != 0) {
fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
exit(EXIT_FAILURE);
}
int match = regexec(&re, name_, 0, NULL, 0);
regfree(&re);
if (match != REG_NOMATCH) {
return true;
}
}
return false;
}
void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
name_ = name;
fn_ = fn;
fn_range_ = fn_range;
if (fn_ == NULL && fn_range_ == NULL) {
fprintf(stderr, "%s: missing function\n", name_);
exit(EXIT_FAILURE);
}
gBenchmarks().insert(std::make_pair(name, this));
}
void Benchmark::Run() {
if (fn_ != NULL) {
RunWithArg(0);
} else {
if (args_.empty()) {
fprintf(stderr, "%s: no args!\n", name_);
exit(EXIT_FAILURE);
}
for (size_t i = 0; i < args_.size(); ++i) {
RunWithArg(args_[i]);
}
}
}
void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
g_flops_processed = 0;
g_benchmark_total_time_ns = 0;
g_benchmark_start_time_ns = NanoTime();
if (fn_ != NULL) {
fn_(iterations);
} else {
fn_range_(iterations, arg);
}
if (g_benchmark_start_time_ns != 0) {
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
}
}
void Benchmark::RunWithArg(int arg) {
// run once in case it's expensive
int iterations = 1;
RunRepeatedlyWithArg(iterations, arg);
while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
int last = iterations;
if (g_benchmark_total_time_ns/iterations == 0) {
iterations = 1e9;
} else {
iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
}
iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
iterations = Round(iterations);
RunRepeatedlyWithArg(iterations, arg);
}
char throughput[100];
throughput[0] = '\0';
if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
}
char full_name[100];
if (fn_range_ != NULL) {
if (arg >= (1<<20)) {
snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
} else if (arg >= (1<<10)) {
snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
} else {
snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
}
} else {
snprintf(full_name, sizeof(full_name), "%s", name_);
}
printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
iterations, g_benchmark_total_time_ns/iterations, throughput);
fflush(stdout);
}
} // namespace testing
void SetBenchmarkFlopsProcessed(int64_t x) {
g_flops_processed = x;
}
void StopBenchmarkTiming() {
if (g_benchmark_start_time_ns != 0) {
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
}
g_benchmark_start_time_ns = 0;
}
void StartBenchmarkTiming() {
if (g_benchmark_start_time_ns == 0) {
g_benchmark_start_time_ns = NanoTime();
}
}
int main(int argc, char* argv[]) {
if (gBenchmarks().empty()) {
fprintf(stderr, "No benchmarks registered!\n");
exit(EXIT_FAILURE);
}
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
int name_width = static_cast<int>(strlen(it->second->Name()));
g_name_column_width = std::max(g_name_column_width, name_width);
}
bool need_header = true;
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
::testing::Benchmark* b = it->second;
if (b->ShouldRun(argc, argv)) {
if (need_header) {
printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
fflush(stdout);
need_header = false;
}
b->Run();
}
}
if (need_header) {
fprintf(stderr, "No matching benchmarks!\n");
fprintf(stderr, "Available benchmarks:\n");
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
fprintf(stderr, " %s\n", it->second->Name());
}
exit(EXIT_FAILURE);
}
return 0;
}

View File

@@ -0,0 +1,39 @@
#define EIGEN_USE_THREADS
#include <string>
#include "tensor_benchmarks.h"
#define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPool pool(threads); \
Eigen::ThreadPoolDevice device(&pool, threads);
// Contractions for number of threads ranging from 1 to 32
// Dimensions are Rows, Cols, Depth
#define BM_ContractionCPU(D1, D2, D3) \
static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(Threads); \
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
suite.contraction(iters); \
} \
BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
// Vector Matrix and Matrix Vector products
BM_ContractionCPU(1, 2000, 500);
BM_ContractionCPU(2000, 1, 500);
// Various skinny matrices
BM_ContractionCPU(250, 3, 512);
BM_ContractionCPU(1500, 3, 512);
BM_ContractionCPU(512, 800, 4);
BM_ContractionCPU(512, 80, 800);
BM_ContractionCPU(512, 80, 13522);
BM_ContractionCPU(1, 80, 13522);
BM_ContractionCPU(3200, 512, 4);
BM_ContractionCPU(3200, 512, 80);
BM_ContractionCPU(3200, 80, 512);

View File

@@ -0,0 +1,30 @@
rm -f tensor_benchmark_sycl
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
tensor_benchmarks_sycl.cc \
benchmark_main.cc \
-I ../../ \
-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
-std=c++11 \
-march=native \
-O3 \
-DNDEBUG \
-DEIGEN_MPL2_ONLY \
-DEIGEN_USE_SYCL=1 \
-DEIGEN_SYCL_LOCAL_MEM=1 \
-no-serial-memop \
-mllvm \
-inline-threshold=10000 \
-fsycl-ih-last \
-sycl-driver \
-Xclang -cl-mad-enable \
-lOpenCL \
-lComputeCpp \
-lpthread \
-o \
tensor_benchmark_sycl\
${@:1}
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
./tensor_benchmark_sycl

View File

@@ -0,0 +1,7 @@
rm -f tensor_contract_sycl_bench
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1}
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
./tensor_contract_sycl_bench

View File

@@ -0,0 +1,597 @@
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"
#define BENCHMARK_RANGE(bench, lo, hi) \
BENCHMARK(bench)->Range(lo, hi)
using Eigen::Tensor;
using Eigen::TensorMap;
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
public:
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
: m_(m), k_(k), n_(n), device_(device) {
initialize();
}
BenchmarkSuite(const Device& device, size_t m)
: m_(m), k_(m), n_(m), device_(device) {
initialize();
}
BenchmarkSuite(const Device& device, size_t m, size_t k)
: m_(1), k_(k), n_(m), device_(device) {
initialize();
}
~BenchmarkSuite() {
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
}
void memcpy(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
}
// Record the number of values copied per second
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void typeCasting(int num_iters) {
eigen_assert(m_ == n_);
Eigen::array<TensorIndex, 2> sizes;
if (sizeof(T) >= sizeof(int)) {
sizes[0] = m_;
sizes[1] = k_;
} else {
sizes[0] = m_ * sizeof(T) / sizeof(int);
sizes[1] = k_ * sizeof(T) / sizeof(int);
}
const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
B.device(device_) = A.template cast<T>();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.template cast<T>();
}
// Record the number of values copied per second
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = m_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = C.random();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = C.random();
}
// Record the number of random numbers generated per second
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = m_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.slice(first_quadrant, quarter_sizes).device(device_) =
A.slice(first_quadrant, quarter_sizes);
C.slice(second_quadrant, quarter_sizes).device(device_) =
B.slice(second_quadrant, quarter_sizes);
C.slice(third_quadrant, quarter_sizes).device(device_) =
A.slice(third_quadrant, quarter_sizes);
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
B.slice(fourth_quadrant, quarter_sizes);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.slice(first_quadrant, quarter_sizes).device(device_) =
A.slice(first_quadrant, quarter_sizes);
C.slice(second_quadrant, quarter_sizes).device(device_) =
B.slice(second_quadrant, quarter_sizes);
C.slice(third_quadrant, quarter_sizes).device(device_) =
A.slice(third_quadrant, quarter_sizes);
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
B.slice(fourth_quadrant, quarter_sizes);
}
// Record the number of values copied from the rhs slice to the lhs slice
// each second
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void rowChip(int num_iters) {
Eigen::array<TensorIndex, 2> input_size;
input_size[0] = k_;
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
Eigen::array<TensorIndex, 1> output_size;
output_size[0] = n_;
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = B.chip(iter % k_, 0);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.chip(iter % k_, 0);
}
// Record the number of values copied from the rhs chip to the lhs.
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
}
void colChip(int num_iters) {
Eigen::array<TensorIndex, 2> input_size;
input_size[0] = k_;
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
Eigen::array<TensorIndex, 1> output_size;
output_size[0] = n_;
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = B.chip(iter % n_, 1);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.chip(iter % n_, 1);
}
// Record the number of values copied from the rhs chip to the lhs.
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
Eigen::array<TensorIndex, 2> size_a;
size_a[0] = m_;
size_a[1] = k_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
Eigen::array<TensorIndex, 2> size_b;
size_b[0] = k_;
size_b[1] = m_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<int, 2> shuffle;
shuffle[0] = 1;
shuffle[1] = 0;
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
// Record the number of values shuffled from A and copied to B each second
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void padding(int num_iters) {
eigen_assert(m_ == k_);
Eigen::array<TensorIndex, 2> size_a;
size_a[0] = m_;
size_a[1] = k_-3;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
Eigen::array<TensorIndex, 2> size_b;
size_b[0] = k_;
size_b[1] = m_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
#if defined(EIGEN_HAS_INDEX_LIST)
Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
Eigen::type2indexpair<2, 1> > paddings;
#else
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
B.device(device_) = A.pad(paddings);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.pad(paddings);
}
// Record the number of values copied from the padded tensor A each second
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void striding(int num_iters) {
eigen_assert(m_ == k_);
Eigen::array<TensorIndex, 2> size_a;
size_a[0] = m_;
size_a[1] = k_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
Eigen::array<TensorIndex, 2> size_b;
size_b[0] = m_;
size_b[1] = k_/2;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 2> strides;
strides[0] = 1;
strides[1] = 2;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
B.device(device_) = A.stride(strides);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.stride(strides);
}
// Record the number of values copied from the padded tensor A each second
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void broadcasting(int num_iters) {
Eigen::array<TensorIndex, 2> size_a;
size_a[0] = m_;
size_a[1] = 1;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
Eigen::array<TensorIndex, 2> size_c;
size_c[0] = m_;
size_c[1] = n_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<int, 2> broadcast;
broadcast[0] = 1;
broadcast[1] = n_;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
broadcast.set(1, n_);
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.broadcast(broadcast);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.broadcast(broadcast);
}
// Record the number of values broadcasted from A and copied to C each second
finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
}
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = m_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
}
// Record the number of FLOP executed per second (2 multiplications and
// 1 addition per value)
finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
}
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = m_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
Eigen::array<TensorIndex, 2> sizes;
sizes[0] = m_;
sizes[1] = m_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.exp() + B.log();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.exp() + B.log();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
// Row reduction
void rowReduction(int num_iters) {
Eigen::array<TensorIndex, 2> input_size;
input_size[0] = k_;
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
Eigen::array<TensorIndex, 1> output_size;
output_size[0] = n_;
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 1> sum_along_dim;
sum_along_dim[0] = 0;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = B.sum(sum_along_dim);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
// Column reduction
void colReduction(int num_iters) {
Eigen::array<TensorIndex, 2> input_size;
input_size[0] = k_;
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
Eigen::array<TensorIndex, 1> output_size;
output_size[0] = k_;
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
a_, output_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 1> sum_along_dim;
sum_along_dim[0] = 1;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
A.device(device_) = B.sum(sum_along_dim);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
A.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
// Full reduction
void fullReduction(int num_iters) {
Eigen::array<TensorIndex, 2> input_size;
input_size[0] = k_;
input_size[1] = n_;
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
Eigen::array<TensorIndex, 0> output_size;
TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = B.sum();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.sum();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) {
contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
}
void contractionRowMajor(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
}
void contractionRowMajorAT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
}
void contractionRowMajorBT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
}
void contractionRowMajorABT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
Eigen::array<TensorIndex, 2> input_sizes;
input_sizes[0] = m_;
input_sizes[1] = n_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
Eigen::array<TensorIndex, 2> kernel_sizes;
kernel_sizes[0] = kernel_x;
kernel_sizes[1] = kernel_y;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
Eigen::array<TensorIndex, 2> result_sizes;
result_sizes[0] = m_ - kernel_x + 1;
result_sizes[1] = n_ - kernel_y + 1;
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
Eigen::array<TensorIndex, 2> dims;
dims[0] = 0;
dims[1] = 1;
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.convolve(B, dims);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.convolve(B, dims);
}
// Record the number of FLOPs executed per second (kernel_size
// multiplications and additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64_t>(2) *
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
}
private:
// do a contraction which is equivalent to a matrix multiplication
template<int Layout>
void contraction(int num_iters, bool trans_a, bool trans_b) {
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = (trans_a ? k_: m_);
sizeA[1] = (trans_a ? m_: k_);
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = (trans_b ? n_: k_);
sizeB[1] = (trans_b ? k_: n_);
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
TensorIndex a_contract_dim = (trans_a ? 0 : 1);
TensorIndex b_contract_dim = (trans_b ? 1 : 0);
dims[0] = DimPair(a_contract_dim, b_contract_dim);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
// Record the number of FLOP executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
}
void initialize() {
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
}
inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
device_.synchronize();
}
#elif defined(EIGEN_USE_SYCL)
if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
device_.synchronize();
}
#endif
StopBenchmarkTiming();
SetBenchmarkFlopsProcessed(num_items);
}
TensorIndex m_;
TensorIndex k_;
TensorIndex n_;
T* a_;
T* b_;
T* c_;
Device device_;
};
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

View File

@@ -0,0 +1,168 @@
#define EIGEN_USE_THREADS
#include <string>
#include "tensor_benchmarks.h"
#define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPool pool(threads); \
Eigen::ThreadPoolDevice device(&pool, threads);
// Simple functions
#define BM_FuncCPU(FUNC, THREADS) \
static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
BM_FuncCPU(typeCasting, 4);
BM_FuncCPU(typeCasting, 8);
BM_FuncCPU(typeCasting, 12);
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
BM_FuncCPU(rowChip, 4);
BM_FuncCPU(rowChip, 8);
BM_FuncCPU(rowChip, 12);
BM_FuncCPU(colChip, 4);
BM_FuncCPU(colChip, 8);
BM_FuncCPU(colChip, 12);
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
BM_FuncCPU(padding, 4);
BM_FuncCPU(padding, 8);
BM_FuncCPU(padding, 12);
BM_FuncCPU(striding, 4);
BM_FuncCPU(striding, 8);
BM_FuncCPU(striding, 12);
BM_FuncCPU(broadcasting, 4);
BM_FuncCPU(broadcasting, 8);
BM_FuncCPU(broadcasting, 12);
BM_FuncCPU(coeffWiseOp, 4);
BM_FuncCPU(coeffWiseOp, 8);
BM_FuncCPU(coeffWiseOp, 12);
BM_FuncCPU(algebraicFunc, 4);
BM_FuncCPU(algebraicFunc, 8);
BM_FuncCPU(algebraicFunc, 12);
BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
BM_FuncCPU(rowReduction, 4);
BM_FuncCPU(rowReduction, 8);
BM_FuncCPU(rowReduction, 12);
BM_FuncCPU(colReduction, 4);
BM_FuncCPU(colReduction, 8);
BM_FuncCPU(colReduction, 12);
// Contractions
#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
if (THREADS == 1) { \
Eigen::DefaultDevice device; \
BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} else { \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
// Convolutions
#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);

View File

@@ -0,0 +1,77 @@
#define EIGEN_USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include "tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
//BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(rowChip);
BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
/*
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
*/

View File

@@ -0,0 +1,75 @@
#define EIGEN_USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include "tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(rowChip);
BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);

View File

@@ -0,0 +1,140 @@
#ifdef EIGEN_USE_SYCL
#include <CL/sycl.hpp>
#include <iostream>
#include "tensor_benchmarks.h"
cl::sycl::gpu_selector selector;
Eigen::QueueInterface queue(selector);
#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \
static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);
BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);
BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);
BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(rowChip);
BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
#endif

View File

@@ -0,0 +1,325 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <SYCL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
#include <ctime>
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
std::ofstream out("Result.txt");
std::chrono::time_point<std::chrono::system_clock> get_time(){
std::chrono::time_point<std::chrono::system_clock> start, end;
return std::chrono::system_clock::now();
}
template<typename Start, typename End, typename TensorIndex>
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
}
// do a contraction which is equivalent to a matrix multiplication
template<typename T, typename Device, typename TensorIndex>
void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
// do a contraction which is equivalent to a matrix multiplication
template<typename T, typename Device, typename TensorIndex>
void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = k_;
sizeA[1] = m_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(0, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = n_;
sizeB[1] = k_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 1);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = k_;
sizeA[1] = m_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = n_;
sizeB[1] = k_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(0, 1);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
int main() {
cl::sycl::gpu_selector selector;
Eigen::QueueInterface queue(selector);
Eigen::SyclDevice device(&queue);
int64_t num_iters =20;
for(int64_t m = 32; m <= 4096; m *= 2)
for(int64_t k = 32; k <= 4096; k *= 2)
for(int64_t n = 32; n <= 4096; n*= 2){
(contraction<float>(device, num_iters, m, k, n));
(contractionRowMajor<float>(device, num_iters, m, k, n));
(contractionAT<float>(device, num_iters, m, k, n));
(contractionBT<float>(device, num_iters, m, k, n));
(contractionABT<float>(device, num_iters, m, k, n));
}
return 0;
}
#endif // EIGEN_BENCH_CONTRACT_SYCL