1 #ifndef flexGradientOperator_H 2 #define flexGradientOperator_H 5 #include "flexLinearOperator.h" 9 __global__
void dxp2dCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
mySign s)
11 const size_t x = threadIdx.x + blockIdx.x * blockDim.x;
12 const size_t y = threadIdx.y + blockIdx.y * blockDim.y;
14 if (x >= sizeX || y >= sizeY)
17 const size_t tmpIndex = x + y * sizeX;
25 output[tmpIndex] += input[tmpIndex + 1] - input[tmpIndex];
30 output[tmpIndex] -= input[tmpIndex + 1] - input[tmpIndex];
35 output[tmpIndex] = input[tmpIndex + 1] - input[tmpIndex];
43 __global__
void dxp2dTransposedCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
mySign s)
45 const size_t x = threadIdx.x + blockIdx.x * blockDim.x;
46 const size_t y = threadIdx.y + blockIdx.y * blockDim.y;
48 if (x >= sizeX || y >= sizeY)
51 const size_t tmpIndex = x + y * sizeX;
53 if (x < sizeX - 1 && x > 0)
59 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - 1]);
64 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - 1]);
69 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - 1]);
80 output[tmpIndex] += -(input[tmpIndex]);
85 output[tmpIndex] -= -(input[tmpIndex]);
90 output[tmpIndex] = -(input[tmpIndex]);
101 output[tmpIndex] += (input[tmpIndex - 1]);
106 output[tmpIndex] -= (input[tmpIndex - 1]);
111 output[tmpIndex] = (input[tmpIndex - 1]);
119 __global__
void dyp2dCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
mySign s)
121 const size_t x = threadIdx.x + blockIdx.x * blockDim.x;
122 const size_t y = threadIdx.y + blockIdx.y * blockDim.y;
124 if (x >= sizeX || y >= sizeY)
127 const size_t tmpIndex = x + y * sizeX;
135 output[tmpIndex] += input[tmpIndex + sizeX] - input[tmpIndex];
140 output[tmpIndex] -= input[tmpIndex + sizeX] - input[tmpIndex];
145 output[tmpIndex] = input[tmpIndex + sizeX] - input[tmpIndex];
153 __global__
void dyp2dTransposedCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
mySign s)
155 const size_t x = threadIdx.x + blockIdx.x * blockDim.x;
156 const size_t y = threadIdx.y + blockIdx.y * blockDim.y;
158 if (x >= sizeX || y >= sizeY)
161 const size_t tmpIndex = x + y * sizeX;
163 if (y < sizeY - 1 && y > 0)
169 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - sizeX]);
174 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - sizeX]);
179 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - sizeX]);
190 output[tmpIndex] += -(input[tmpIndex]);
195 output[tmpIndex] -= -(input[tmpIndex]);
200 output[tmpIndex] = -(input[tmpIndex]);
211 output[tmpIndex] += (input[tmpIndex - sizeX]);
216 output[tmpIndex] -= (input[tmpIndex - sizeX]);
221 output[tmpIndex] = (input[tmpIndex - sizeX]);
230 int getGlobalIdx_3D_3D(){
231 int blockId = blockIdx.x + blockIdx.y * gridDim.x
232 + gridDim.x * gridDim.y * blockIdx.z;
233 int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
234 + (threadIdx.z * (blockDim.x * blockDim.y))
235 + (threadIdx.y * blockDim.x) + threadIdx.x;
240 __global__
void dxp3dCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
242 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
243 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
244 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
246 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
248 if (x >= sizeX || y >= sizeY || z >= sizeZ)
257 output[tmpIndex] += input[tmpIndex + 1] - input[tmpIndex];
262 output[tmpIndex] -= input[tmpIndex + 1] - input[tmpIndex];
267 output[tmpIndex] = input[tmpIndex + 1] - input[tmpIndex];
275 __global__
void dxp3dTransposedCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
277 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
278 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
279 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
281 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
283 if (x >= sizeX || y >= sizeY || z >= sizeZ)
286 if (x < sizeX - 1 && x > 0)
292 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - 1]);
297 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - 1]);
302 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - 1]);
313 output[tmpIndex] += -(input[tmpIndex]);
318 output[tmpIndex] -= -(input[tmpIndex]);
323 output[tmpIndex] = -(input[tmpIndex]);
334 output[tmpIndex] += (input[tmpIndex - 1]);
339 output[tmpIndex] -= (input[tmpIndex - 1]);
344 output[tmpIndex] = (input[tmpIndex - 1]);
352 __global__
void dyp3dCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
354 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
355 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
356 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
358 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
360 if (x >= sizeX || y >= sizeY || z >= sizeZ)
369 output[tmpIndex] += input[tmpIndex + sizeX] - input[tmpIndex];
374 output[tmpIndex] -= input[tmpIndex + sizeX] - input[tmpIndex];
379 output[tmpIndex] = input[tmpIndex + sizeX] - input[tmpIndex];
387 __global__
void dyp3dTransposedCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
389 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
390 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
391 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
393 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
395 if (x >= sizeX || y >= sizeY || z >= sizeZ)
398 if (y < sizeY - 1 && y > 0)
404 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - sizeX]);
409 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - sizeX]);
414 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - sizeX]);
425 output[tmpIndex] += -(input[tmpIndex]);
430 output[tmpIndex] -= -(input[tmpIndex]);
435 output[tmpIndex] = -(input[tmpIndex]);
446 output[tmpIndex] += (input[tmpIndex - sizeX]);
451 output[tmpIndex] -= (input[tmpIndex - sizeX]);
456 output[tmpIndex] = (input[tmpIndex - sizeX]);
464 __global__
void dzp3dCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
466 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
467 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
468 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
470 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
472 if (x >= sizeX || y >= sizeY || z >= sizeZ)
481 output[tmpIndex] += input[tmpIndex + sizeX * sizeY] - input[tmpIndex];
486 output[tmpIndex] -= input[tmpIndex + sizeX * sizeY] - input[tmpIndex];
491 output[tmpIndex] = input[tmpIndex + sizeX * sizeY] - input[tmpIndex];
499 __global__
void dzp3dTransposedCUDA(T* output,
const T* input,
const size_t sizeX,
const size_t sizeY,
const size_t sizeZ,
mySign s)
501 const size_t x = blockDim.x * blockIdx.x + threadIdx.x;
502 const size_t y = blockDim.y * blockIdx.y + threadIdx.y;
503 const size_t z = blockDim.z * blockIdx.z + threadIdx.z;
505 const size_t tmpIndex = x + sizeX * y + sizeX * sizeY * z;
507 if (x >= sizeX || y >= sizeY || z >= sizeZ)
510 if (z < sizeZ - 1 && z > 0)
516 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - sizeX * sizeY]);
521 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - sizeX * sizeY]);
526 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - sizeX * sizeY]);
537 output[tmpIndex] += -(input[tmpIndex]);
542 output[tmpIndex] -= -(input[tmpIndex]);
547 output[tmpIndex] = -(input[tmpIndex]);
558 output[tmpIndex] += (input[tmpIndex - sizeX * sizeY]);
563 output[tmpIndex] -= (input[tmpIndex - sizeX * sizeY]);
568 output[tmpIndex] = (input[tmpIndex - sizeX * sizeY]);
583 typedef thrust::device_vector<T> Tdata;
585 typedef std::vector<T> Tdata;
589 std::vector<int> inputDimension;
592 int numberDimensions;
604 inputDimension(AInputDimension),
605 gradDirection(aGradDirection),
607 numberDimensions(static_cast<int>(AInputDimension.size())),
flexLinearOperator<T>(vectorProduct(AInputDimension), vectorProduct(AInputDimension), gradientOp, aMinus)
612 std::vector<int> dimsCopy;
613 dimsCopy.resize(this->inputDimension.size());
615 std::copy(this->inputDimension.begin(), this->inputDimension.end(), dimsCopy.begin());
626 void updateValue(T* ptr,
mySign s, T value)
648 void dxp3d(
const Tdata &input, Tdata &output,
mySign s)
650 int sizeZ = this->inputDimension[2];
651 int sizeY = this->inputDimension[1];
652 int sizeX = this->inputDimension[0] - 1;
654 #pragma omp parallel for 655 for (
int k = 0; k < sizeZ; ++k)
657 for (
int j = 0; j < sizeY; ++j)
659 for (
int i = 0; i < sizeX; ++i)
661 const int tmpIndex = this->index3DtoLinear(i, j, k);
663 this->updateValue(&output[tmpIndex], s, input[tmpIndex + 1] - input[tmpIndex]);
669 void dyp3d(
const Tdata &input, Tdata &output,
mySign s)
671 int sizeZ = this->inputDimension[2];
672 int sizeY = this->inputDimension[1] - 1;
673 int sizeX = this->inputDimension[0];
675 #pragma omp parallel for 676 for (
int k = 0; k < sizeZ; ++k)
678 for (
int j = 0; j < sizeY; ++j)
680 for (
int i = 0; i < sizeX; ++i)
682 const int tmpIndex1 = this->index3DtoLinear(i, j, k);
683 const int tmpIndex2 = this->index3DtoLinear(i, j + 1, k);
685 this->updateValue(&output[tmpIndex1], s, input[tmpIndex2] - input[tmpIndex1]);
691 void dzp3d(
const Tdata &input, Tdata &output,
mySign s)
693 int sizeZ = this->inputDimension[2] - 1;
694 int sizeY = this->inputDimension[1];
695 int sizeX = this->inputDimension[0];
697 #pragma omp parallel for 698 for (
int k = 0; k < sizeZ; ++k)
700 for (
int j = 0; j < sizeY; ++j)
702 for (
int i = 0; i < sizeX; ++i)
704 const int tmpIndex1 = this->index3DtoLinear(i, j, k);
705 const int tmpIndex2 = this->index3DtoLinear(i, j, k + 1);
707 this->updateValue(&output[tmpIndex1], s, input[tmpIndex2] - input[tmpIndex1]);
713 void dxp3dTransposed(
const Tdata &input, Tdata &output,
mySign s)
715 const int sizeZ = this->inputDimension[2];
716 const int sizeY = this->inputDimension[1];
717 const int sizeX = this->inputDimension[0] - 1;
719 #pragma omp parallel for 720 for (
int k = 0; k < sizeZ; ++k)
722 for (
int j = 0; j < sizeY; ++j)
724 for (
int i = 1; i < sizeX; ++i)
726 int tmpIndex = this->index3DtoLinear(i, j, k);
728 this->updateValue(&output[tmpIndex], s, -(input[tmpIndex] - input[tmpIndex - 1]));
733 #pragma omp parallel for 734 for (
int k = 0; k < this->inputDimension[2]; ++k)
736 for (
int j = 0; j < this->inputDimension[1]; ++j)
738 const int index1 = this->index3DtoLinear(0, j, k);
739 const int index2 = this->index3DtoLinear(this->inputDimension[0] - 1, j, k);
740 const int index3 = this->index3DtoLinear(this->inputDimension[0] - 2, j, k);
742 this->updateValue(&output[index1], s, -input[index1]);
743 this->updateValue(&output[index2], s, input[index3]);
748 void dyp3dTransposed(
const Tdata &input, Tdata &output,
mySign s)
750 const int sizeZ = this->inputDimension[2];
751 const int sizeY = this->inputDimension[1] - 1;
752 const int sizeX = this->inputDimension[0];
754 #pragma omp parallel for 755 for (
int k = 0; k < sizeZ; ++k)
757 for (
int j = 1; j < sizeY; ++j)
759 for (
int i = 0; i < sizeX; ++i)
761 const int tmpIndex1 = this->index3DtoLinear(i, j, k);
762 const int tmpIndex2 = this->index3DtoLinear(i, j - 1, k);
764 this->updateValue(&output[tmpIndex1], s, -(input[tmpIndex1] - input[tmpIndex2]));
769 #pragma omp parallel for 770 for (
int k = 0; k < this->inputDimension[2]; ++k)
772 for (
int i = 0; i < this->inputDimension[0]; ++i)
774 const int index1 = this->index3DtoLinear(i, 0, k);
775 const int index2 = this->index3DtoLinear(i, this->inputDimension[1] - 1, k);
776 const int index3 = this->index3DtoLinear(i, this->inputDimension[1] - 2, k);
778 this->updateValue(&output[index1], s, -input[index1]);
779 this->updateValue(&output[index2], s, input[index3]);
784 void dzp3dTransposed(
const Tdata &input, Tdata &output,
mySign s)
786 const int sizeZ = this->inputDimension[2] - 1;
787 const int sizeY = this->inputDimension[1];
788 const int sizeX = this->inputDimension[0];
790 #pragma omp parallel for 791 for (
int k = 1; k < sizeZ; ++k)
793 for (
int j = 0; j < sizeY; ++j)
795 for (
int i = 0; i < sizeX; ++i)
797 const int tmpIndex1 = this->index3DtoLinear(i, j, k);
798 const int tmpIndex2 = this->index3DtoLinear(i, j, k - 1);
800 this->updateValue(&output[tmpIndex1], s, -(input[tmpIndex1] - input[tmpIndex2]));
805 #pragma omp parallel for 806 for (
int j = 0; j < this->inputDimension[1]; ++j)
808 for (
int i = 0; i < this->inputDimension[0]; ++i)
810 const int index1 = this->index3DtoLinear(i, j, 0);
811 const int index2 = this->index3DtoLinear(i, j, this->inputDimension[2] - 1);
812 const int index3 = this->index3DtoLinear(i, j, this->inputDimension[2] - 2);
814 this->updateValue(&output[index1], s, -input[index1]);
815 this->updateValue(&output[index2], s, input[index3]);
821 void dxp2d(
const Tdata &input, Tdata &output,
mySign s)
823 int sizeY = this->inputDimension[1];
824 int sizeX = this->inputDimension[0] - 1;
826 #pragma omp parallel for 827 for (
int j = 0; j < sizeY; ++j)
829 for (
int i = 0; i < sizeX; ++i)
831 const int tmpIndex = this->index2DtoLinear(i, j);
837 output[tmpIndex] += input[tmpIndex + 1] - input[tmpIndex];
842 output[tmpIndex] -= input[tmpIndex + 1] - input[tmpIndex];
847 output[tmpIndex] = input[tmpIndex + 1] - input[tmpIndex];
855 void dyp2d(
const Tdata &input, Tdata &output,
mySign s)
857 int sizeY = this->inputDimension[1] - 1;
858 int sizeX = this->inputDimension[0];
860 #pragma omp parallel for 861 for (
int j = 0; j < sizeY; ++j)
863 for (
int i = 0; i < sizeX; ++i)
865 const int tmpIndex = this->index2DtoLinear(i, j);
871 output[tmpIndex] += input[tmpIndex + sizeX] - input[tmpIndex];
876 output[tmpIndex] -= input[tmpIndex + sizeX] - input[tmpIndex];
881 output[tmpIndex] = input[tmpIndex + sizeX] - input[tmpIndex];
889 void dxp2dTransposed(
const Tdata &input, Tdata &output,
mySign s)
891 int sizeY = this->inputDimension[1];
892 int sizeX = this->inputDimension[0] - 1;
894 #pragma omp parallel for 895 for (
int j = 0; j < sizeY; ++j)
897 for (
int i = 1; i < sizeX; ++i)
899 int tmpIndex = this->index2DtoLinear(i, j);
905 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - 1]);
910 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - 1]);
915 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - 1]);
922 for (
int j = 0; j < this->inputDimension[1]; ++j)
928 output[this->index2DtoLinear(0, j)] += -input[this->index2DtoLinear(0, j)];
929 output[this->index2DtoLinear(this->inputDimension[0] - 1, j)] += input[this->index2DtoLinear(this->inputDimension[0] - 2, j)];
934 output[this->index2DtoLinear(0, j)] -= -input[this->index2DtoLinear(0, j)];
935 output[this->index2DtoLinear(this->inputDimension[0] - 1, j)] -= input[this->index2DtoLinear(this->inputDimension[0] - 2, j)];
940 output[this->index2DtoLinear(0, j)] = -input[this->index2DtoLinear(0, j)];
941 output[this->index2DtoLinear(this->inputDimension[0] - 1, j)] = input[this->index2DtoLinear(this->inputDimension[0] - 2, j)];
948 void dyp2dTransposed(
const Tdata &input, Tdata &output,
mySign s)
950 int sizeY = this->inputDimension[1] - 1;
951 int sizeX = this->inputDimension[0];
953 #pragma omp parallel for 954 for (
int j = 1; j < sizeY; ++j)
956 for (
int i = 0; i < sizeX; ++i)
958 int tmpIndex = this->index2DtoLinear(i, j);
964 output[tmpIndex] += -(input[tmpIndex] - input[tmpIndex - sizeX]);
969 output[tmpIndex] -= -(input[tmpIndex] - input[tmpIndex - sizeX]);
974 output[tmpIndex] = -(input[tmpIndex] - input[tmpIndex - sizeX]);
981 for (
int i = 0; i < this->inputDimension[0]; ++i)
987 output[this->index2DtoLinear(i, 0)] += -input[this->index2DtoLinear(i, 0)];
988 output[this->index2DtoLinear(i, this->inputDimension[1] - 1)] += input[this->index2DtoLinear(i, this->inputDimension[1] - 2)];
993 output[this->index2DtoLinear(i, 0)] -= -input[this->index2DtoLinear(i, 0)];
994 output[this->index2DtoLinear(i, this->inputDimension[1] - 1)] -= input[this->index2DtoLinear(i, this->inputDimension[1] - 2)];
999 output[this->index2DtoLinear(i, 0)] = -input[this->index2DtoLinear(i, 0)];
1000 output[this->index2DtoLinear(i, this->inputDimension[1] - 1)] = input[this->index2DtoLinear(i, this->inputDimension[1] - 2)];
1007 void doTimesCPU(
bool transposed,
const Tdata &input, Tdata &output,
mySign s)
1009 if (this->inputDimension.size() == 2)
1011 if (this->gradDirection == 0)
1013 if (transposed ==
false)
1015 this->dxp2d(input, output, s);
1019 this->dxp2dTransposed(input, output, s);
1022 else if (this->gradDirection == 1)
1024 if (transposed ==
false)
1026 this->dyp2d(input, output, s);
1030 this->dyp2dTransposed(input, output, s);
1034 else if (this->inputDimension.size() == 3)
1036 if (this->gradDirection == 0)
1038 if (transposed ==
false)
1040 this->dxp3d(input, output, s);
1044 this->dxp3dTransposed(input, output, s);
1047 else if (this->gradDirection == 1)
1049 if (transposed ==
false)
1051 this->dyp3d(input, output, s);
1055 this->dyp3dTransposed(input, output, s);
1058 else if (this->gradDirection == 2)
1060 if (transposed ==
false)
1062 this->dzp3d(input, output, s);
1066 this->dzp3dTransposed(input, output, s);
1072 printf(
"Gradient not implemented for dim!={2,3}\n");
1078 void doTimesCUDA(
bool transposed,
const Tdata &input, Tdata &output,
mySign s)
1080 size_t sizeX = this->inputDimension[0];
1084 if (this->inputDimension.size() > 1)
1086 sizeY = this->inputDimension[1];
1088 if (this->inputDimension.size() > 2)
1090 sizeZ = this->inputDimension[2];
1093 dim3 block = dim3(32,16,1);
1094 dim3 grid = dim3((sizeX + block.x - 1) / block.x, (sizeY + block.y - 1) / block.y, (sizeZ + block.z - 1) / block.z);
1096 T* ptrOutput = thrust::raw_pointer_cast(output.data());
1097 const T* ptrInput = thrust::raw_pointer_cast(input.data());
1099 if (this->inputDimension.size() == 2)
1101 if (this->gradDirection == 0)
1103 if (transposed ==
false)
1105 dxp2dCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], s);
1109 dxp2dTransposedCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], s);
1112 else if (this->gradDirection == 1)
1114 if (transposed ==
false)
1116 dyp2dCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], s);
1120 dyp2dTransposedCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], s);
1124 else if (this->inputDimension.size() == 3)
1126 if (this->gradDirection == 0)
1128 if (transposed ==
false)
1130 dxp3dCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1134 dxp3dTransposedCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1137 else if (this->gradDirection == 1)
1139 if (transposed ==
false)
1141 dyp3dCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1145 dyp3dTransposedCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1148 else if (this->gradDirection == 2)
1150 if (transposed ==
false)
1152 dzp3dCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1156 dzp3dTransposedCUDA << <grid, block >> >(ptrOutput, ptrInput, this->inputDimension[0], this->inputDimension[1], this->inputDimension[2], s);
1162 printf(
"Gradient not implemented for dim!={2,3}\n");
1168 void doTimes(
bool transposed,
const Tdata &input, Tdata &output,
mySign s)
1170 if (this->
isMinus && s == PLUS)
1174 else if (this->
isMinus && s == MINUS)
1180 if (this->type == backward)
1182 transposed = !transposed;
1194 this->doTimesCUDA(transposed, input, output, s);
1196 this->doTimesCPU(transposed, input, output, s);
1200 void timesPlus(
bool transposed,
const Tdata &input, Tdata &output)
1202 this->doTimes(transposed, input, output, PLUS);
1205 void timesMinus(
bool transposed,
const Tdata &input, Tdata &output)
1207 this->doTimes(transposed, input, output, MINUS);
1210 void times(
bool transposed,
const Tdata &input, Tdata &output)
1212 this->doTimes(transposed, input, output, EQUALS);
1218 return static_cast<T
>(2);
1223 std::vector<T> result(this->
getNumRows(),(T)2);
1228 int index3DtoLinear(
int i,
int j,
int k)
1230 return (i + j*this->inputDimension[0] + k*this->inputDimension[0] * this->inputDimension[1]);
1233 int index2DtoLinear(
int i,
int j)
1235 return (i + j*this->inputDimension[0]);
1241 thrust::device_vector<T> result(this->
getNumRows(), (T)2);
int getNumRows() const
returns number of rows of the linear operator
Definition: flexLinearOperator.h:57
flexGradientOperator(std::vector< int > AInputDimension, int aGradDirection, gradientType aType, bool aMinus)
initializes the gradient operator
Definition: flexGradientOperator.h:603
represents a gradient operator
Definition: flexGradientOperator.h:579
bool isMinus
determines if operator is negated
Definition: flexLinearOperator.h:25
void times(bool transposed, const Tdata &input, Tdata &output)
applies linear operator on vector
Definition: flexGradientOperator.h:1210
flexGradientOperator< T > * copy()
copies the linear operator
Definition: flexGradientOperator.h:610
T getMaxRowSumAbs(bool transposed)
returns the maximum sum of absolute values per row used for preconditioning
Definition: flexGradientOperator.h:1215
std::vector< T > getAbsRowSum(bool transposed)
returns a vector of sum of absolute values per row used for preconditioning
Definition: flexGradientOperator.h:1221
void timesMinus(bool transposed, const Tdata &input, Tdata &output)
applies linear operator on vector and substracts its result from y
Definition: flexGradientOperator.h:1205
abstract base class for linear operators
Definition: flexLinearOperator.h:12
thrust::device_vector< T > getAbsRowSumCUDA(bool transposed)
same function as getAbsRowSum() but implemented in CUDA
Definition: flexGradientOperator.h:1239
void timesPlus(bool transposed, const Tdata &input, Tdata &output)
applies linear operator on vector and adds its result to y
Definition: flexGradientOperator.h:1200