I am investigating expression template performance in matrix multiplication. I am unrolling the loop in the matrix multiplication, and I find that the performance of native doubles is equal to that of the expression templates until the depth of the expression gets too large, and the expression template performance goes downhill. Here's the code:
#include <iostream>
#include <vector>
#include <chrono>
template<typename T>
struct Real
{
typedef T RealType;
Real() noexcept : m_real(0) {}
inline explicit Real(RealType real) noexcept
: m_real(real)
{
}
inline RealType value() const noexcept
{
return m_real;
}
template<typename Expr>
void operator+=(const Expr& expr)
{
m_real += expr.value();
}
RealType m_real;
};
#define DEFINE_BINARY_OPERATOR(NAME, OP) \
template<typename Expr1, typename Expr2> \
struct NAME \
{ \
typedef typename Expr1::RealType RealType; \
\
NAME() noexcept {} \
NAME(const Expr1& e1, const Expr2& e2) noexcept \
: m_e1(e1), m_e2(e2) {} \
\
inline RealType value() const noexcept \
{ \
return m_e1.value() OP m_e2.value(); \
} \
\
Expr1 m_e1; \
Expr2 m_e2; \
}; \
template<typename Expr1, typename Expr2> \
inline decltype(auto) operator OP (const Expr1& e1, const Expr2& e2) noexcept\
{ \
return NAME<Expr1, Expr2>(e1, e2); \
} \
DEFINE_BINARY_OPERATOR(Multiply, *)
DEFINE_BINARY_OPERATOR(Add, +)
DEFINE_BINARY_OPERATOR(Subtract, -)
DEFINE_BINARY_OPERATOR(Divide, /)
template<typename T>
struct Matrix
{
explicit Matrix(size_t size)
: m_matrix(size, std::vector<T>(size))
{
}
explicit Matrix(size_t size, const T& intialVal)
: m_matrix(size, std::vector<T>(size, intialVal))
{
}
std::vector<T>& operator[](size_t row) { return m_matrix[row]; }
const std::vector<T>& operator[](size_t row) const { return m_matrix[row]; }
size_t size() const { return m_matrix.size(); }
std::vector<std::vector<T> > m_matrix;
};
#define MATRIX_MULT_KERNEL(N) m1[i][k+N] * m2[j][k+N]
#define MATRIX_MULT_ADD_KERNELS_4(N) \
MATRIX_MULT_KERNEL(N) + MATRIX_MULT_KERNEL(N+1) + MATRIX_MULT_KERNEL(N+2) + MATRIX_MULT_KERNEL(N+3)
template<typename T>
Matrix<T> operator*(const Matrix<T>& m1, const Matrix<T>& m2)
{
if (m1.size() != m2.size())
throw std::runtime_error("wrong sizes");
Matrix<T> m3(m1.size());
for (size_t i = 0; i < m1.size(); ++i)
for (size_t j = 0; j < m1.size(); ++j)
for (size_t k = 0; k < m1.size(); k+=16)
{
auto v0 = MATRIX_MULT_ADD_KERNELS_4(0);
auto v1 = MATRIX_MULT_ADD_KERNELS_4(4);
auto v2 = MATRIX_MULT_ADD_KERNELS_4(8);
auto v3 = MATRIX_MULT_ADD_KERNELS_4(12);
// auto v4 = MATRIX_MULT_ADD_KERNELS_4(16);
// auto v5 = MATRIX_MULT_ADD_KERNELS_4(20);
// auto v6 = MATRIX_MULT_ADD_KERNELS_4(24);
// auto v7 = MATRIX_MULT_ADD_KERNELS_4(28);
auto expr = (v0 + v1 + v2 + v3);// + v4 + v5 + v6 + v7);
m3[i][j] += expr;
}
return m3;
}
decltype(auto) now()
{
return std::chrono::high_resolution_clock::now();
}
decltype(auto) milliseconds(const decltype(now())& start, const decltype(now())& end)
{
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}
int main()
{
constexpr static const int SIZE = 1024;
{
Matrix<double> m1(SIZE, 1.0);
Matrix<double> m2(SIZE, 1.0);
auto begin = now();
Matrix<double> m3 = m1 * m2;
auto end = now();
std::cout << milliseconds(begin, end) << "ms" << std::endl;
}
{
Matrix<Real<double> > m1(SIZE, Real<double>(1.0));
Matrix<Real<double> > m2(SIZE, Real<double>(1.0));
auto begin = now();
Matrix<Real<double> > m3 = m1 * m2;
auto end = now();
std::cout << milliseconds(begin, end) << "ms" << std::endl;
}
}
If I uncomment out the code in the matrix multiplication loop, and increment k by 32, the expression templates takes three times longer. Does anyone know why this might be happening?
I'm compiling on GCC 5.4 on cygwin on an Intel Xeon E3-1225 V2.