ADD: new track message, Entity class and Position class

2022-12-20 17:20:35 +01:00
parent 469ecfb099
commit 98ebb563a8
2114 changed files with 482360 additions and 24 deletions
--- a/libs/eigen/bench/tensors/README
+++ b/libs/eigen/bench/tensors/README
@@ -0,0 +1,20 @@
+The tensor benchmark suite is made of several parts.
+
+The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
+
+To compile the floating point CPU benchmarks, simply call:
+g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the floating point GPU benchmarks, simply call:
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
+
+We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
+
+To compile and run the benchmark for SYCL, using ComputeCpp, simply run the
+following commands:
+1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
+2. bash eigen_sycl_bench.sh
+
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call:
+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
--- a/libs/eigen/bench/tensors/benchmark.h
+++ b/libs/eigen/bench/tensors/benchmark.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <vector>
+
+namespace testing {
+class Benchmark {  // One registered benchmark: a name, the function to time and the arguments to run it with.
+ public:
+  Benchmark(const char* name, void (*fn)(int)) {  // Plain benchmark: fn receives the iteration count.
+    Register(name, fn, NULL);
+  }
+  Benchmark(const char* name, void (*fn_range)(int, int)) {  // Range benchmark: fn_range receives (iterations, arg).
+    Register(name, NULL, fn_range);
+  }
+  Benchmark* Arg(int x);  // Adds x to the arguments to run with; returns this for chaining.
+  Benchmark* Range(int lo, int hi);  // Adds lo, lo*8, lo*64, ... below hi, then hi; returns this.
+  const char* Name();  // The name passed at construction.
+  bool ShouldRun(int argc, char* argv[]);  // True if no filter was given or a regex in argv[1..] matches the name.
+  void Run();  // Runs fn once, or fn_range once per registered argument.
+ private:
+  const char* name_;  // Stored as given, not copied.
+  void (*fn_)(int);  // Set for plain benchmarks, NULL otherwise.
+  void (*fn_range_)(int, int);  // Set for range benchmarks, NULL otherwise.
+  std::vector<int> args_;  // Arguments collected by Arg() and Range().
+  void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
+  void RunRepeatedlyWithArg(int iterations, int arg);
+  void RunWithArg(int arg);
+};
+}  // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t);  // Sets the operation count used for the MFlops/s column.
+void StopBenchmarkTiming();  // Pauses the benchmark clock, accumulating the time elapsed so far.
+void StartBenchmarkTiming();  // Resumes the benchmark clock if it is paused.
+#define BENCHMARK(f) \
+    static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
+        (new ::testing::Benchmark(#f, f))
--- a/libs/eigen/bench/tensors/benchmark_main.cc
+++ b/libs/eigen/bench/tensors/benchmark_main.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "benchmark.h"
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <inttypes.h>
+#include <time.h>
+#include <map>
+
+static int64_t g_flops_processed;
+static int64_t g_benchmark_total_time_ns;
+static int64_t g_benchmark_start_time_ns;
+typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
+typedef BenchmarkMap::iterator BenchmarkMapIt;
+
+BenchmarkMap& gBenchmarks() {  // Global name -> benchmark registry, created on first use.
+  static BenchmarkMap g_benchmarks;  // Function-local static: constructed the first time this function is called.
+  return g_benchmarks;
+}
+
+static int g_name_column_width = 20;
+
+static int Round(int n) {
+  // Rounds n up to a "nice" count: 2, 5 or 10 times a power of ten (11 -> 20, 21 -> 50).
+  int64_t base = 1;  // 64-bit: base*10, 5*base and 10*base overflow int once n exceeds 1e9.
+  while (base*10 < n) base *= 10;
+  int64_t rounded = 10*base;
+  if (n < 2*base) {
+    rounded = 2*base;
+  } else if (n < 5*base) {
+    rounded = 5*base;
+  }
+  const int64_t kIntMax = 2147483647;  // Saturate instead of wrapping (assumes 32-bit int).
+  return static_cast<int>(rounded < kIntMax ? rounded : kIntMax);
+}
+
+#ifdef __APPLE__
+  #include <mach/mach_time.h>
+  static mach_timebase_info_data_t g_time_info;  // numer/denom factor applied to mach_absolute_time() in NanoTime().
+  static void __attribute__((constructor)) init_info() {  // The constructor attribute makes this run at program start-up.
+    mach_timebase_info(&g_time_info);
+  }
+#endif
+
+static int64_t NanoTime() {  // Current timestamp in nanoseconds (CLOCK_MONOTONIC, or scaled mach_absolute_time on Apple).
+#if defined(__APPLE__)
+  const uint64_t ticks = mach_absolute_time();
+  return ticks * g_time_info.numer / g_time_info.denom;
+#else
+  const int64_t kNanosPerSecond = 1000000000LL;
+  struct timespec now = {};
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  return static_cast<int64_t>(now.tv_sec) * kNanosPerSecond + now.tv_nsec;
+#endif
+}
+
+namespace testing {
+Benchmark* Benchmark::Arg(int arg) {  // Appends one argument value to run the benchmark with.
+  args_.push_back(arg);
+  return this;  // Returned so that calls can be chained.
+}
+
+Benchmark* Benchmark::Range(int lo, int hi) {
+  // Registers lo, lo*8, lo*64, ... (each value below hi) and finally hi itself.
+  const int kRangeMultiplier = 8;
+  if (hi < lo) { const int tmp = hi; hi = lo; lo = tmp; }  // Accept the bounds in either order.
+  // A non-positive lo never grows when multiplied (the loop would never end), so it is
+  // recorded once and the progression restarts at 1. The 64-bit cursor cannot overflow.
+  int64_t next = lo;
+  if (lo < 1 && lo < hi) {
+    args_.push_back(lo);
+    next = 1;
+  }
+  for (; next < hi; next *= kRangeMultiplier) args_.push_back(static_cast<int>(next));
+  args_.push_back(hi);  // We always run the hi number.
+  return this;
+}
+
+const char* Benchmark::Name() {  // Returns the name stored by Register().
+  return name_;
+}
+bool Benchmark::ShouldRun(int argc, char* argv[]) {  // True if the command line selects this benchmark.
+  if (argc == 1) {
+    return true;  // With no arguments, we run all benchmarks.
+  }
+  // Otherwise each argument is compiled as a POSIX basic regular expression
+  // (cflags == 0) and the benchmark runs if any of them matches its name.
+  for (int i = 1; i < argc; i++) {
+    regex_t re;
+    if (regcomp(&re, argv[i], 0) != 0) {
+      fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    }
+    const int status = regexec(&re, name_, 0, NULL, 0);
+    regfree(&re);
+    if (status == 0) {  // Only 0 is a match; REG_NOMATCH and any other error code are not.
+      return true;
+    }
+  }
+  return false;
+}
+void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
+  name_ = name;
+  fn_ = fn;
+  fn_range_ = fn_range;
+  if (fn_ == NULL && fn_range_ == NULL) {  // At least one of the two callbacks is required.
+    fprintf(stderr, "%s: missing function\n", name_);
+    exit(EXIT_FAILURE);
+  }
+  gBenchmarks().insert(std::make_pair(name, this));  // map::insert keeps the existing entry if the name is already present.
+}
+void Benchmark::Run() {
+  // A plain benchmark takes no argument, so it is run exactly once (the 0 is a placeholder).
+  if (fn_ != NULL) {
+    RunWithArg(0);
+    return;
+  }
+  // A range benchmark needs at least one argument from Arg() or Range().
+  if (args_.empty()) {
+    fprintf(stderr, "%s: no args!\n", name_);
+    exit(EXIT_FAILURE);
+  }
+  for (std::vector<int>::const_iterator arg = args_.begin(); arg != args_.end(); ++arg) RunWithArg(*arg);
+}
+void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
+  g_benchmark_total_time_ns = 0;
+  g_flops_processed = 0;
+  g_benchmark_start_time_ns = NanoTime();  // The clock runs from here unless the benchmark pauses it.
+  if (fn_ == NULL) {
+    fn_range_(iterations, arg);
+  } else {
+    fn_(iterations);
+  }
+  // A zero start time means the benchmark stopped the clock itself; nothing is left to add.
+  if (g_benchmark_start_time_ns == 0) return;
+  g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+}
+void Benchmark::RunWithArg(int arg) {
+  // Run once in case it's expensive, then grow the iteration count until a run lasts one second or reaches 1e9 iterations.
+  int iterations = 1;
+  RunRepeatedlyWithArg(iterations, arg);
+  const int64_t kBillion = 1000000000;  // One second in nanoseconds, and also the iteration cap.
+  while (g_benchmark_total_time_ns < kBillion && iterations < kBillion) {
+    const int64_t last = iterations;
+    const int64_t ns_per_iter = g_benchmark_total_time_ns/last;
+    // Estimate the count needed for one second, add 50% headroom, and grow by at most 100x per step.
+    int64_t next = (ns_per_iter == 0) ? kBillion : kBillion / ns_per_iter;
+    next = std::max(last + 1, std::min(next + next/2, 100*last));
+    // 64-bit math keeps 100*last from overflowing int; capping at 1e9 keeps Round() within int range.
+    iterations = Round(static_cast<int>(std::min(next, kBillion)));
+    RunRepeatedlyWithArg(iterations, arg);
+  }
+  char throughput[100];  // Stays empty unless the benchmark reported a flop count.
+  throughput[0] = '\0';
+  if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
+    double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
+    double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
+    snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
+  }
+  char full_name[100];  // "<name>/<arg>" for range benchmarks, with K/M suffixes for args >= 2^10 / 2^20.
+  if (fn_range_ != NULL) {
+    if (arg >= (1<<20)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
+    } else if (arg >= (1<<10)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
+    } else {
+      snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
+    }
+  } else {
+    snprintf(full_name, sizeof(full_name), "%s", name_);
+  }
+  printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
+         iterations, g_benchmark_total_time_ns/iterations, throughput);  // Columns: name, iterations, ns/op, throughput.
+  fflush(stdout);
+}
+}  // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t x) {  // Records the operation count that RunWithArg() turns into MFlops/s.
+  g_flops_processed = x;
+}
+void StopBenchmarkTiming() {
+  // Fold the running interval (if any) into the total, then mark the clock as stopped (start == 0).
+  const bool running = (g_benchmark_start_time_ns != 0);
+  if (running) g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+  g_benchmark_start_time_ns = 0;
+}
+void StartBenchmarkTiming() {
+  // Restart the clock only if it is currently stopped; a running interval is left untouched.
+  if (g_benchmark_start_time_ns != 0) return;
+  g_benchmark_start_time_ns = NanoTime();
+}
+int main(int argc, char* argv[]) {
+  BenchmarkMap& registry = gBenchmarks();
+  if (registry.empty()) {
+    fprintf(stderr, "No benchmarks registered!\n");
+    exit(EXIT_FAILURE);
+  }
+  // Widen the name column so that the longest benchmark name fits.
+  for (BenchmarkMapIt it = registry.begin(); it != registry.end(); ++it) {
+    const int name_width = static_cast<int>(strlen(it->second->Name()));
+    if (name_width > g_name_column_width) g_name_column_width = name_width;
+  }
+  // Run every benchmark selected by the command line; the header is printed before the first one.
+  bool ran_any = false;
+  for (BenchmarkMapIt it = registry.begin(); it != registry.end(); ++it) {
+    ::testing::Benchmark* const bench = it->second;
+    if (!bench->ShouldRun(argc, argv)) continue;
+    if (!ran_any) {
+      printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
+      fflush(stdout);
+      ran_any = true;
+    }
+    bench->Run();
+  }
+  if (ran_any) return 0;  // Otherwise nothing matched: list what is available and fail.
+  fprintf(stderr, "No matching benchmarks!\n");
+  fprintf(stderr, "Available benchmarks:\n");
+  for (BenchmarkMapIt it = registry.begin(); it != registry.end(); ++it) {
+    fprintf(stderr, "  %s\n", it->second->Name());
+  }
+  exit(EXIT_FAILURE);
+}
--- a/libs/eigen/bench/tensors/contraction_benchmarks_cpu.cc
+++ b/libs/eigen/bench/tensors/contraction_benchmarks_cpu.cc
@@ -0,0 +1,39 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPool pool(threads);                \
+Eigen::ThreadPoolDevice device(&pool, threads);  // Declares `pool` and `device` in the scope where the macro is used.
+
+
+// Contractions for number of threads ranging from 1 to 32 (the benchmark's range argument is the thread count).
+// D1, D2, D3 are forwarded to BenchmarkSuite as (m, k, n), so D2 is the contracted dimension: A is D1 x D2, B is D2 x D3.
+#define BM_ContractionCPU(D1, D2, D3)                                         \
+  static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
+    StopBenchmarkTiming();                                                    \
+    CREATE_THREAD_POOL(Threads);                                              \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+    suite.contraction(iters);                                                 \
+  }                                                                           \
+  BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
+
+
+// Vector Matrix and Matrix Vector products
+BM_ContractionCPU(1, 2000, 500);
+BM_ContractionCPU(2000, 1, 500);
+
+// Various skinny matrices
+BM_ContractionCPU(250, 3, 512);
+BM_ContractionCPU(1500, 3, 512);
+
+BM_ContractionCPU(512, 800, 4);
+BM_ContractionCPU(512, 80, 800);
+BM_ContractionCPU(512, 80, 13522);
+BM_ContractionCPU(1, 80, 13522);
+
+BM_ContractionCPU(3200, 512, 4);
+BM_ContractionCPU(3200, 512, 80);
+BM_ContractionCPU(3200, 80, 512);
--- a/libs/eigen/bench/tensors/eigen_sycl_bench.sh
+++ b/libs/eigen/bench/tensors/eigen_sycl_bench.sh
@@ -0,0 +1,30 @@
+rm -f tensor_benchmark_sycl
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
+${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
+tensor_benchmarks_sycl.cc \
+benchmark_main.cc \
+-I ../../ \
+-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
+-std=c++11 \
+-march=native \
+-O3 \
+-DNDEBUG \
+-DEIGEN_MPL2_ONLY \
+-DEIGEN_USE_SYCL=1 \
+-DEIGEN_SYCL_LOCAL_MEM=1 \
+-no-serial-memop \
+-mllvm \
+-inline-threshold=10000 \
+-fsycl-ih-last \
+-sycl-driver \
+-Xclang -cl-mad-enable \
+-lOpenCL \
+-lComputeCpp \
+-lpthread \
+-o \
+tensor_benchmark_sycl\
+${@:1}
+
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_benchmark_sycl
--- a/libs/eigen/bench/tensors/eigen_sycl_bench_contract.sh
+++ b/libs/eigen/bench/tensors/eigen_sycl_bench_contract.sh
@@ -0,0 +1,7 @@
+rm -f tensor_contract_sycl_bench  # Remove any stale binary before rebuilding.
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"  # Abort with a message if the ComputeCpp root is unset.
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: ${COMPUTECPP_PACKAGE_ROOT_DIR}"
+"${COMPUTECPP_PACKAGE_ROOT_DIR}"/bin/compute++  tensor_contract_sycl_bench.cc -I ../../ -I "${COMPUTECPP_PACKAGE_ROOT_DIR}"/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench "$@"
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_contract_sycl_bench
+
--- a/libs/eigen/bench/tensors/tensor_benchmarks.h
+++ b/libs/eigen/bench/tensors/tensor_benchmarks.h
@@ -0,0 +1,597 @@
+#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+
+typedef int TensorIndex;
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "benchmark.h"
+
+#define BENCHMARK_RANGE(bench, lo, hi) \
+  BENCHMARK(bench)->Range(lo, hi)
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// TODO(bsteiner): also templatize on the input type since we have users
+// for int8 as well as floats.
+template <typename Device, typename T> class BenchmarkSuite {
+ public:
+  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)  // General case: a_ holds m*k, b_ holds k*n, c_ holds m*n values.
+      : m_(m), k_(k), n_(n), device_(device) {
+    initialize();
+  }
+
+  BenchmarkSuite(const Device& device, size_t m)  // Square case: all three dimensions equal m.
+      : m_(m), k_(m), n_(m), device_(device) {
+    initialize();
+  }
+
+  BenchmarkSuite(const Device& device, size_t m, size_t k)  // Note the mapping: m_ is fixed to 1, the first size becomes n_, the second k_.
+      : m_(1), k_(k), n_(m), device_(device) {
+    initialize();
+  }
+
+  ~BenchmarkSuite() {  // Releases the three buffers obtained in initialize().
+    device_.deallocate(a_);
+    device_.deallocate(b_);
+    device_.deallocate(c_);
+  }
+
+  void memcpy(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void typeCasting(int num_iters) {
+    eigen_assert(m_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    if (sizeof(T) >= sizeof(int)) {
+      sizes[0] = m_;
+      sizes[1] = k_;
+    } else {
+      sizes[0] = m_ * sizeof(T) / sizeof(int);
+      sizes[1] = k_ * sizeof(T) / sizeof(int);
+    }
+    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
+    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+  void random(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = C.random();
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = C.random();
+    }
+    // Record the number of random numbers generated per second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void slicing(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
+    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
+    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
+    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
+    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+    // Record the number of values copied from the rhs slice to the lhs slice
+    // each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void rowChip(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+  }
+
+  void colChip(int num_iters) {  // Times copying one chip of B along dimension 1 into C.
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;  // NOTE(review): a chip along dimension 1 of a k_ x n_ tensor should have k_ entries; n_ only agrees when k_ == n_ - confirm.
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);  // Cycles through the n_ chip offsets.
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+  }
+
+  void shuffling(int num_iters) {
+    eigen_assert(m_ == n_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+    Eigen::array<int, 2> shuffle;
+    shuffle[0] = 1;
+    shuffle[1] = 0;
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+    // Record the number of values shuffled from A and copied to B each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+ void padding(int num_iters) {
+    eigen_assert(m_ == k_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_-3;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+#if defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
+                         Eigen::type2indexpair<2, 1> > paddings;
+#else
+    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
+    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
+    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+#endif
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+    // Record the number of values copied from the padded tensor A each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
+  }
+
+ void striding(int num_iters) {  // Times copying every second column of A (strides 1 and 2) into B.
+    eigen_assert(m_ == k_);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = m_;
+    size_b[1] = k_/2;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 2> strides;
+    strides[0] = 1;
+    strides[1] = 2;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
+#endif
+
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+    // Record the number of values copied from the strided tensor A each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);  // NOTE(review): counts m_ * k_ values although B holds m_ * (k_/2) - confirm intent.
+  }
+
+
+  void broadcasting(int num_iters) {
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = 1;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_c;
+    size_c[0] = m_;
+    size_c[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<int, 2> broadcast;
+    broadcast[0] = 1;
+    broadcast[1] = n_;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
+    broadcast.set(1, n_);
+#endif
+
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+    // Record the number of values broadcasted from A and copied to C each second
+    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
+  }
+
+  void coeffWiseOp(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+    // Record the number of FLOP executed per second (2 multiplications and
+    // 1 addition per value)
+    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
+  }
+
+  void algebraicFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+}
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+  void transcendentalFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+  }
+
+ // Row reduction
+  void rowReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 0;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  // Column reduction
+  void colReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+        b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = k_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
+        a_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 1;
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    A.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      A.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  // Full reduction
+  void fullReduction(int num_iters) {
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+        b_, input_size);
+    Eigen::array<TensorIndex, 0> output_size;
+    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
+        c_, output_size);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum();
+    }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+  }
+
+  
+
+  // do a contraction which is equivalent to a matrix multiplication
+  void contraction(int num_iters) {
+      contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
+  }
+
+    void contractionRowMajor(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
+  }
+    
+  void contractionRowMajorAT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
+  }
+
+  void contractionRowMajorBT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
+  }
+
+  void contractionRowMajorABT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
+  }
+
+  void convolution(int num_iters, int kernel_x, int kernel_y) {
+    Eigen::array<TensorIndex, 2> input_sizes;
+    input_sizes[0] = m_;
+    input_sizes[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
+    Eigen::array<TensorIndex, 2> kernel_sizes;
+    kernel_sizes[0] = kernel_x;
+    kernel_sizes[1] = kernel_y;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    Eigen::array<TensorIndex, 2> result_sizes;
+    result_sizes[0] = m_ - kernel_x + 1;
+    result_sizes[1] = n_ - kernel_y + 1;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<TensorIndex, 2> dims;
+    dims[0] = 0;
+    dims[1] = 1;
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+     }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+    }
+    // Record the number of FLOPs executed per second (kernel_size
+    // multiplications and additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) *
+        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
+  }
+
+ private:
+ // Contracts A with B over one dimension pair (a matrix multiplication); trans_a / trans_b select the transposed storage of A / B.
+  template<int Layout>
+  void contraction(int num_iters, bool trans_a, bool trans_b) {
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = (trans_a ? k_: m_);
+    sizeA[1] = (trans_a ? m_:  k_);
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = (trans_b ? n_: k_);
+    sizeB[1] = (trans_b ? k_: n_);
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
+
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    TensorIndex a_contract_dim = (trans_a ? 0 : 1);  // Whichever dimension of A has extent k_.
+    TensorIndex b_contract_dim = (trans_b ? 1 : 0);  // Whichever dimension of B has extent k_.
+    dims[0] = DimPair(a_contract_dim, b_contract_dim);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+     }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (k_ multiplications and
+    // k_ additions for each of the m_ * n_ values in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  }
+
+  void initialize() {  // Allocates and fills the three device buffers; called from every constructor.
+    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));  // m_ * k_ values
+    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));  // k_ * n_ values
+    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));  // m_ * n_ values
+
+    // Initialize the content of the memory pools to prevent asan from
+    // complaining.
+    device_.memset(a_, 12, m_ * k_ * sizeof(T));
+    device_.memset(b_, 23, k_ * n_ * sizeof(T));
+    device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  }
+
+  inline void finalizeBenchmark(int64_t num_items) {  // Stops the clock and reports num_items as the processed count.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      device_.synchronize();  // Presumably waits for queued device work so that it is included in the timing - confirm.
+    }
+#elif defined(EIGEN_USE_SYCL)
+    if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
+      device_.synchronize();  // Same as the GPU branch, for SYCL devices.
+    }
+
+#endif
+    StopBenchmarkTiming();
+    SetBenchmarkFlopsProcessed(num_items);
+  }
+
+
+  TensorIndex m_;
+  TensorIndex k_;
+  TensorIndex n_;
+  T* a_;
+  T* b_;
+  T* c_;
+  Device device_;
+};
+#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
--- a/libs/eigen/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/libs/eigen/bench/tensors/tensor_benchmarks_cpu.cc
@@ -0,0 +1,168 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPool pool(threads);                \
+Eigen::ThreadPoolDevice device(&pool, threads);  // declares the locals `pool` and `device` in the scope that expands the macro
+
+// Simple functions: defines BM_<FUNC>_<THREADS>T, which runs suite.FUNC(iters) on a THREADS-thread ThreadPoolDevice suite built with size N.
+#define BM_FuncCPU(FUNC, THREADS)                                    \
+  static void BM_##FUNC##_##THREADS##T(int iters, int N) {           \
+    StopBenchmarkTiming();                                           \
+    CREATE_THREAD_POOL(THREADS);                                     \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                               \
+  }                                                                  \
+  BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
+
+BM_FuncCPU(memcpy, 4);  // every simple benchmark below is instantiated for 4, 8 and 12 threads
+BM_FuncCPU(memcpy, 8);
+BM_FuncCPU(memcpy, 12);
+
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
+BM_FuncCPU(random, 4);
+BM_FuncCPU(random, 8);
+BM_FuncCPU(random, 12);
+
+BM_FuncCPU(slicing, 4);
+BM_FuncCPU(slicing, 8);
+BM_FuncCPU(slicing, 12);
+
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
+BM_FuncCPU(shuffling, 4);
+BM_FuncCPU(shuffling, 8);
+BM_FuncCPU(shuffling, 12);
+
+BM_FuncCPU(padding, 4);
+BM_FuncCPU(padding, 8);
+BM_FuncCPU(padding, 12);
+
+BM_FuncCPU(striding, 4);
+BM_FuncCPU(striding, 8);
+BM_FuncCPU(striding, 12);
+
+BM_FuncCPU(broadcasting, 4);
+BM_FuncCPU(broadcasting, 8);
+BM_FuncCPU(broadcasting, 12);
+
+BM_FuncCPU(coeffWiseOp, 4);
+BM_FuncCPU(coeffWiseOp, 8);
+BM_FuncCPU(coeffWiseOp, 12);
+
+BM_FuncCPU(algebraicFunc, 4);
+BM_FuncCPU(algebraicFunc, 8);
+BM_FuncCPU(algebraicFunc, 12);
+
+BM_FuncCPU(transcendentalFunc, 4);
+BM_FuncCPU(transcendentalFunc, 8);
+BM_FuncCPU(transcendentalFunc, 12);
+
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
+
+
+// Contractions: defines BM_<FUNC>_<D1>x<D2>x<D3>_<THREADS>T; THREADS == 1 runs on DefaultDevice, any other value on a THREADS-thread pool.
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                      \
+  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
+    StopBenchmarkTiming();                                                      \
+    if (THREADS == 1) {                                                         \
+      Eigen::DefaultDevice device;                                              \
+      BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3);    \
+      suite.FUNC(iters);                                                        \
+    } else {                                                                    \
+      CREATE_THREAD_POOL(THREADS);                                              \
+      BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+      suite.FUNC(iters);                                                        \
+    }                                                                           \
+  }                                                                             \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
+
+
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);  // a dimension written as N expands to the benchmark's `int N` parameter; each shape runs with 1, 4, 8, 12 and 16 threads
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
+
+
+// Convolutions: defines BM_<FUNC>_<DIM1>x<DIM2>_<THREADS>T, which calls suite.FUNC(iters, DIM1, DIM2) on a thread-pool suite built with size N.
+#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS)                    \
+  static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
+    StopBenchmarkTiming();                                                     \
+    CREATE_THREAD_POOL(THREADS);                                               \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N);	       \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);  // each DIM1 x DIM2 pair is instantiated for 4, 8 and 12 threads
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
--- a/libs/eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/libs/eigen/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -0,0 +1,77 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+// Simple functions: defines BM_<FUNC>, which runs suite.FUNC(iters) on an Eigen::half GpuDevice suite built with size N after a cudaDeviceSynchronize().
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);  // one benchmark per BenchmarkSuite method; `random` is left commented out in this half-float build
+BM_FuncGPU(typeCasting);
+//BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions: defines BM_<FUNC>_<D1>x<D2>x<D3>, which runs suite.FUNC(iters) on an Eigen::half GpuDevice suite built with sizes D1, D2, D3.
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3);   \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);  // a dimension written as N expands to the benchmark's `int N` parameter
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions: defines BM_<FUNC>_<DIM1>x<DIM2>, which calls suite.FUNC(iters, DIM1, DIM2) on an Eigen::half GpuDevice suite built with size N.
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N);            \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+/*
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+*/
--- a/libs/eigen/bench/tensors/tensor_benchmarks_gpu.cu
+++ b/libs/eigen/bench/tensors/tensor_benchmarks_gpu.cu
@@ -0,0 +1,75 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+// Simple functions: defines BM_<FUNC>, which runs suite.FUNC(iters) on a float GpuDevice suite built with size N after a cudaDeviceSynchronize().
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);  // one benchmark per BenchmarkSuite method
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions: defines BM_<FUNC>_<D1>x<D2>x<D3>, which runs suite.FUNC(iters) on a float GpuDevice suite built with sizes D1, D2, D3.
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);         \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);  // a dimension written as N expands to the benchmark's `int N` parameter
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions: defines BM_<FUNC>_<DIM1>x<DIM2>, which calls suite.FUNC(iters, DIM1, DIM2) on a float GpuDevice suite built with size N.
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    Eigen::CudaStreamDevice stream;                                            \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);                  \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);  // the two numbers are forwarded as DIM1 and DIM2 to suite.convolution
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
--- a/libs/eigen/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/libs/eigen/bench/tensors/tensor_benchmarks_sycl.cc
@@ -0,0 +1,140 @@
+#ifdef EIGEN_USE_SYCL
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+cl::sycl::gpu_selector selector;  // file-scope selector used to build the shared queue below
+Eigen::QueueInterface queue(selector);  // single queue; every benchmark in this file wraps it in its own SyclDevice
+#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2)                      \
+  static void BM_##FUNC##_##D1##x##D2(int iters, int N) {           \
+    StopBenchmarkTiming();                                          \
+    Eigen::SyclDevice device(&queue);                               \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
+    suite.FUNC(iters);                                              \
+  }                                                                 \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);  // defines BM_<FUNC>_<D1>x<D2>; the suite is sized by D1 and D2, the body does not use N
+
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);  // row, column and full reductions on fixed (D1, D2) sizes
+BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
+
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
+BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
+#define BM_FuncGPU(FUNC)                                       \
+  static void BM_##FUNC(int iters, int N) {                    \
+    StopBenchmarkTiming();                                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                         \
+  }                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);  // defines BM_<FUNC>: runs suite.FUNC(iters) on a float SyclDevice suite built with size N
+
+BM_FuncGPU(rowReduction);  // reductions first, then the remaining BenchmarkSuite methods, all sized by N
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+// Contractions: defines BM_<FUNC>_<D1>x<D2>x<D3>, which runs suite.FUNC(iters) on a float SyclDevice suite built with sizes D1, D2, D3.
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                       \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {        \
+    StopBenchmarkTiming();                                              \
+    Eigen::SyclDevice device(&queue);                                   \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
+    suite.FUNC(iters);                                                  \
+  }                                                                     \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);  // each contraction variant is run on the same four shapes; N expands to the `int N` parameter
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
+
+
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
+
+// Convolutions: defines BM_<FUNC>_<DIM1>x<DIM2>, which calls suite.FUNC(iters, DIM1, DIM2) on a float SyclDevice suite built with size N.
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {  \
+    StopBenchmarkTiming();                                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters, DIM1, DIM2);                             \
+  }                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);  // the two numbers are forwarded as DIM1 and DIM2 to suite.convolution
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+#endif
--- a/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc
+++ b/libs/eigen/bench/tensors/tensor_contract_sycl_bench.cc
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#include <SYCL/sycl.hpp>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+std::ofstream out("Result.txt");
+
+// Returns the current time point of std::chrono::system_clock.
+std::chrono::time_point<std::chrono::system_clock> get_time(){
+  return std::chrono::system_clock::now();
+}
+
+// Prints the GFLOP/s reached by a timed run of num_iters contractions to std::cout and to the `out` file stream.
+template<typename Start, typename End, typename TensorIndex>
+void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
+  const std::chrono::duration<double> elapsed_seconds = end-start;
+  // Each contraction is counted as 2 * m_ * n_ * k_ floating point operations; the rate is scaled to GFLOP/s.
+  const double gflops = static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9;
+  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " << gflops << "\n";
+  out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " << gflops << "\n";
+}
+
+// Times num_iters contractions of an (m_ x k_) tensor with a (k_ x n_) tensor, which is equivalent to a matrix multiplication.
+template<typename T, typename Device, typename TensorIndex>
+void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  T* b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  T* c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  // Synchronize first so that earlier device work (memsets, warmup) is not attributed to the timed loop.
+  device_.synchronize();
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  // Synchronize before reading the clock so that contractions still in flight are part of the measurement.
+  device_.synchronize();
+  auto end = get_time();
+  // Report the number of FLOPs executed per second.
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+
+// Times num_iters contractions of an (m_ x k_) tensor with a (k_ x n_) tensor, with A, B and C mapped as row-major tensors.
+template<typename T, typename Device, typename TensorIndex>
+void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  T* b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  T* c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  // Synchronize first so that earlier device work (memsets, warmup) is not attributed to the timed loop.
+  device_.synchronize();
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  // Synchronize before reading the clock so that contractions still in flight are part of the measurement.
+  device_.synchronize();
+  auto end = get_time();
+  // Report the number of FLOPs executed per second.
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+// Times num_iters row-major contractions where A is laid out as (k_ x m_) and is contracted over its dimension 0 with dimension 0 of B.
+template<typename T, typename Device, typename TensorIndex>
+void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  T* b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  T* c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  // Synchronize first so that earlier device work (memsets, warmup) is not attributed to the timed loop.
+  device_.synchronize();
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  // Synchronize before reading the clock so that contractions still in flight are part of the measurement.
+  device_.synchronize();
+  auto end = get_time();
+  // Report the number of FLOPs executed per second.
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+// Times num_iters row-major contractions where B is laid out as (n_ x k_) and dimension 1 of A is contracted with dimension 1 of B.
+template<typename T, typename Device, typename TensorIndex>
+void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  T* b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  T* c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  // Synchronize first so that earlier device work (memsets, warmup) is not attributed to the timed loop.
+  device_.synchronize();
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  // Synchronize before reading the clock so that contractions still in flight are part of the measurement.
+  device_.synchronize();
+  auto end = get_time();
+  // Report the number of FLOPs executed per second.
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+// Times num_iters row-major contractions where A is (k_ x m_) and B is (n_ x k_); dimension 0 of A is contracted with dimension 1 of B.
+template<typename T, typename Device, typename TensorIndex>
+void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  T* b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  T* c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  // Synchronize first so that earlier device work (memsets, warmup) is not attributed to the timed loop.
+  device_.synchronize();
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  // Synchronize before reading the clock so that contractions still in flight are part of the measurement.
+  device_.synchronize();
+  auto end = get_time();
+  // Report the number of FLOPs executed per second.
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+int main() {
+  cl::sycl::gpu_selector selector;
+  Eigen::QueueInterface queue(selector);
+  Eigen::SyclDevice device(&queue);
+  const int64_t num_iters = 20, min_dim = 32, max_dim = 4096;  // m, k and n each sweep the powers of two in [min_dim, max_dim]
+  for (int64_t m = min_dim; m <= max_dim; m *= 2)
+    for (int64_t k = min_dim; k <= max_dim; k *= 2)
+      for (int64_t n = min_dim; n <= max_dim; n *= 2) {
+        contraction<float>(device, num_iters, m, k, n);
+        contractionRowMajor<float>(device, num_iters, m, k, n);
+        contractionAT<float>(device, num_iters, m, k, n);
+        contractionBT<float>(device, num_iters, m, k, n);
+        contractionABT<float>(device, num_iters, m, k, n);
+      }
+  return 0;
+}
+
+#endif // EIGEN_BENCH_CONTRACT_SYCL