#ifndef GMATTENSOR_DETAIL_HPP
#define GMATTENSOR_DETAIL_HPP

#include <algorithm>
#include <array>
#include <cstddef>
#include <functional>
#include <numeric>

#include <xtensor/xtensor.hpp>

// Note: the helper "allocate<RANK, T>::type" used below provides a tensor type of the requested
// rank, used to allocate return values.

namespace GMatTensor {
namespace detail {
// Kernels acting on an array of 2nd-order tensors "A": the last two dimensions of "A" are the
// tensor dimensions (each of size "nd"), the leading dimensions form an arbitrary "array" shape.
template <class T, size_t nd>
struct impl_A2 {

    using value_type = typename T::value_type;
    using shape_type = typename T::shape_type;

    static_assert(xt::has_fixed_rank_t<T>::value, "Only fixed rank allowed.");
    static_assert(xt::get_rank<T>::value >= 2, "Rank too low.");

    // Number of leading "array" dimensions, and number of scalars per tensor of each order.
    constexpr static size_t rank = xt::get_rank<T>::value - 2;
    constexpr static size_t stride0 = 1;
    constexpr static size_t stride2 = nd * nd;
    constexpr static size_t stride4 = nd * nd * nd * nd;
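    // Illustration (assumed example, not part of the upstream header): for
    // T = xt::xtensor<double, 4> and nd = 3 the items are 3x3 tensors stored in the last two
    // dimensions, so
    //   rank    == 4 - 2 == 2   (two leading "array" dimensions)
    //   stride2 == 9            (scalars per 2nd-order tensor)
    //   stride4 == 81           (scalars per 4th-order tensor)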
    // Number of tensors stored in "A" (the product of the leading "array" dimensions).
    template <class S>
    static size_t toSizeT0(const S& shape)
    {
        using ST = typename S::value_type;
        return std::accumulate(shape.cbegin(), shape.cbegin() + rank, ST(1), std::multiplies<ST>());
    }

    // Shape of the leading "array" dimensions only.
    template <class S>
    static std::array<size_t, rank> toShapeT0(const S& shape)
    {
        std::array<size_t, rank> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        return ret;
    }

    // Shape of the "array" dimensions, followed by the shape of one 2nd-order tensor.
    template <class S>
    static std::array<size_t, rank + 2> toShapeT2(const S& shape)
    {
        std::array<size_t, rank + 2> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        ret[rank + 0] = nd;
        ret[rank + 1] = nd;
        return ret;
    }

    // Shape of the "array" dimensions, followed by the shape of one 4th-order tensor.
    template <class S>
    static std::array<size_t, rank + 4> toShapeT4(const S& shape)
    {
        std::array<size_t, rank + 4> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        ret[rank + 0] = nd;
        ret[rank + 1] = nd;
        ret[rank + 2] = nd;
        ret[rank + 3] = nd;
        return ret;
    }
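    // Illustration (assumed example): for an array of 3x3 tensors with shape {m, n, 3, 3}:
    //   toSizeT0  -> m * n
    //   toShapeT0 -> {m, n}
    //   toShapeT2 -> {m, n, 3, 3}
    //   toShapeT4 -> {m, n, 3, 3, 3, 3}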
    // Apply "func" to each 2nd-order tensor in "A"; store the scalar result per tensor in "ret".
    // "func" receives a raw pointer to the "stride2" scalars of one tensor.
    template <class R, typename F>
    static void ret0(const T& A, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            ret.flat(i) = func(&A.flat(i * stride2));
        }
    }
    // Same, but allocate (and return) the output.
    template <typename F>
    static auto ret0(const T& A, F func) -> typename allocate<rank, T>::type
    {
        using return_type = typename allocate<rank, T>::type;
        return_type ret = return_type::from_shape(toShapeT0(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            ret.flat(i) = func(&A.flat(i * stride2));
        }
        return ret;
    }
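    // Usage sketch (assumption, not part of the header): compute the trace of every 3x3 tensor
    // in a row-major array "A"; the lambda gets a pointer to the 9 scalars of one tensor.
    //
    //   xt::xtensor<double, 4> A = xt::zeros<double>({5, 6, 3, 3});
    //   auto tr = impl_A2<decltype(A), 3>::ret0(A, [](const double* a) {
    //       return a[0] + a[4] + a[8]; // a[i * 3 + j]
    //   });
    //   // "tr" has shape {5, 6}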
    // Apply "func" to each pair of 2nd-order tensors in "A" and "B"; store the scalar result in "ret".
    template <class R, typename F>
    static void B2_ret0(const T& A, const T& B, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            ret.flat(i) = func(&A.flat(i * stride2), &B.flat(i * stride2));
        }
    }
    // Same, but allocate (and return) the output.
    template <typename F>
    static auto B2_ret0(const T& A, const T& B, F func) -> typename allocate<rank, T>::type
    {
        using return_type = typename allocate<rank, T>::type;
        return_type ret = return_type::from_shape(toShapeT0(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            ret.flat(i) = func(&A.flat(i * stride2), &B.flat(i * stride2));
        }
        return ret;
    }
    // Apply "func" to each pair of 2nd-order tensors in "A" and "B";
    // "func" writes a 2nd-order tensor to "ret".
    template <class R, typename F>
    static void B2_ret2(const T& A, const T& B, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &B.flat(i * stride2), &ret.flat(i * stride2));
        }
    }
    // Same, but allocate (and return) the output.
    template <typename F>
    static auto B2_ret2(const T& A, const T& B, F func) -> typename allocate<rank + 2, T>::type
    {
        using return_type = typename allocate<rank + 2, T>::type;
        return_type ret = return_type::from_shape(toShapeT2(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &B.flat(i * stride2), &ret.flat(i * stride2));
        }
        return ret;
    }
    // Apply "func" to each pair of 2nd-order tensors in "A" and "B";
    // "func" writes a 4th-order tensor to "ret".
    template <class R, typename F>
    static void B2_ret4(const T& A, const T& B, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &B.flat(i * stride2), &ret.flat(i * stride4));
        }
    }
    // Same, but allocate (and return) the output.
    template <typename F>
    static auto B2_ret4(const T& A, const T& B, F func) -> typename allocate<rank + 4, T>::type
    {
        using return_type = typename allocate<rank + 4, T>::type;
        return_type ret = return_type::from_shape(toShapeT4(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &B.flat(i * stride2), &ret.flat(i * stride4));
        }
        return ret;
    }
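    // Usage note (assumption): this signature suits operations that combine two 2nd-order tensors
    // into a 4th-order one, e.g. a dyadic product where "func" fills, per item,
    // ret[i][j][k][l] = a[i][j] * b[k][l].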
    // Apply "func" to each 2nd-order tensor in "A"; "func" writes a 2nd-order tensor to "ret".
    template <class R, typename F>
    static void ret2(const T& A, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &ret.flat(i * stride2));
        }
    }
    // Same, but allocate (and return) the output.
    template <typename F>
    static auto ret2(const T& A, F func) -> typename allocate<rank + 2, T>::type
    {
        using return_type = typename allocate<rank + 2, T>::type;
        return_type ret = return_type::from_shape(toShapeT2(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride2), &ret.flat(i * stride2));
        }
        return ret;
    }
};
// Kernels acting on an array of 4th-order tensors "A": the last four dimensions of "A" are the
// tensor dimensions (each of size "nd"), the leading dimensions form an arbitrary "array" shape.
template <class T, size_t nd>
struct impl_A4 {

    using value_type = typename T::value_type;
    using shape_type = typename T::shape_type;

    static_assert(xt::has_fixed_rank_t<T>::value, "Only fixed rank allowed.");
    static_assert(xt::get_rank<T>::value >= 4, "Rank too low.");

    // Number of leading "array" dimensions, and number of scalars per tensor of each order.
    constexpr static size_t rank = xt::get_rank<T>::value - 4;
    constexpr static size_t stride0 = 1;
    constexpr static size_t stride2 = nd * nd;
    constexpr static size_t stride4 = nd * nd * nd * nd;
    // Number of tensors stored in "A" (the product of the leading "array" dimensions).
    template <class S>
    static size_t toSizeT0(const S& shape)
    {
        using ST = typename S::value_type;
        return std::accumulate(shape.cbegin(), shape.cbegin() + rank, ST(1), std::multiplies<ST>());
    }

    // Shape of the leading "array" dimensions only.
    template <class S>
    static std::array<size_t, rank> toShapeT0(const S& shape)
    {
        std::array<size_t, rank> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        return ret;
    }

    // Shape of the "array" dimensions, followed by the shape of one 2nd-order tensor.
    template <class S>
    static std::array<size_t, rank + 2> toShapeT2(const S& shape)
    {
        std::array<size_t, rank + 2> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        ret[rank + 0] = nd;
        ret[rank + 1] = nd;
        return ret;
    }

    // Shape of the "array" dimensions, followed by the shape of one 4th-order tensor.
    template <class S>
    static std::array<size_t, rank + 4> toShapeT4(const S& shape)
    {
        std::array<size_t, rank + 4> ret;
        std::copy(shape.cbegin(), shape.cbegin() + rank, ret.begin());
        ret[rank + 0] = nd;
        ret[rank + 1] = nd;
        ret[rank + 2] = nd;
        ret[rank + 3] = nd;
        return ret;
    }
    // Apply "func" to each pair of one 4th-order tensor from "A" and one 2nd-order tensor from "B";
    // "func" writes a 2nd-order tensor to "ret".
    template <class U, class R, typename F>
    static void B2_ret2(const T& A, const U& B, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride4), &B.flat(i * stride2), &ret.flat(i * stride2));
        }
    }
    // Same, but allocate (and return) the output.
    template <class U, typename F>
    static auto B2_ret2(const T& A, const U& B, F func) -> typename allocate<rank + 2, T>::type
    {
        using return_type = typename allocate<rank + 2, T>::type;
        return_type ret = return_type::from_shape(toShapeT2(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride4), &B.flat(i * stride2), &ret.flat(i * stride2));
        }
        return ret;
    }
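    // Usage note (assumption): this signature suits contracting a 4th-order with a 2nd-order
    // tensor per array item, e.g. sig[i][j] = C[i][j][k][l] * eps[k][l], with "func" operating on
    // raw pointers to one 4th-order tensor, one 2nd-order tensor, and the 2nd-order output.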
    // Apply "func" to each pair of one 4th-order tensor from "A" and one 2nd-order tensor from "B";
    // "func" writes a 4th-order tensor to "ret".
    template <class U, class R, typename F>
    static void B2_ret4(const T& A, const U& B, R& ret, F func)
    {
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride4), &B.flat(i * stride2), &ret.flat(i * stride4));
        }
    }
    // Same, but allocate (and return) the output.
    template <class U, typename F>
    static auto B2_ret4(const T& A, const U& B, F func) -> typename allocate<rank + 4, T>::type
    {
        using return_type = typename allocate<rank + 4, T>::type;
        return_type ret = return_type::from_shape(toShapeT4(A.shape()));
        #pragma omp parallel for
        for (size_t i = 0; i < toSizeT0(A.shape()); ++i) {
            func(&A.flat(i * stride4), &B.flat(i * stride2), &ret.flat(i * stride4));
        }
        return ret;
    }
};

} // namespace detail
} // namespace GMatTensor
// Note on the macros used in the library: all assertions are implemented as
// GMATTENSOR_ASSERT(expr); the tensor products / operations of the public API are built on top
// of the kernels above.

#endif
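// Sketch (assumption, for illustration only; the library's actual configuration header defines
// the real macro): an opt-in assertion macro of the kind referenced above typically looks like
//
//   #ifdef GMATTENSOR_ENABLE_ASSERT
//   #define GMATTENSOR_ASSERT(expr) \
//       if (!(expr)) { throw std::runtime_error("Assertion failed: " #expr); }
//   #else
//   #define GMATTENSOR_ASSERT(expr)
//   #endif
//
// so that runtime checks cost nothing unless assertions are explicitly enabled at compile time.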