// Doxygen source listing of flexMatrixGPU.h: include guard, headers, and the
// opening of the class body. Many original lines are absent from this
// extraction; tokens below are kept verbatim, only comments were added.
1 #ifndef flexMatrixGPU_H 2 #define flexMatrixGPU_H 4 #include <cuda_runtime.h> 5 #include <cusparse_v2.h> 9 #include "flexLinearOperator.h" 17 typedef thrust::device_vector<T> Tdata;
// Host-side alias for the same name -- presumably the two Tdata typedefs are
// separated by an #ifdef (CUDA vs. host build) on the missing lines; verify.
19 typedef std::vector<T> Tdata;
// cuSPARSE context and matrix descriptor shared by all matrix operations.
23 cusparseHandle_t handle;
24 cusparseMatDescr_t descrA;
// Constructor: builds an aNumRows x aNumCols sparse matrix on the GPU from
// host-side compressed arrays (rowList/colList/indexVal). When formatCRS is
// true the input is already CSR; otherwise it is CSC and is converted to CSR
// on the device. aMinus is forwarded to the base class (negated-operator
// flag). NOTE(review): this doxygen extraction drops lines (braces, the
// switch header, break statements, an if/else around the nnz reads) -- the
// comments below hedge anything those missing lines would establish.
43 flexMatrixGPU(
int aNumRows,
int aNumCols,
int* rowList,
int* colList, T* indexVal,
bool formatCRS,
bool aMinus) :
flexLinearOperator<T>(aNumRows, aNumCols, matrixGPUOp, aMinus)
// Create the cuSPARSE context and a general, zero-based-index descriptor.
// NOTE(review): return codes of cusparseCreate and the cudaMalloc calls
// below are not checked -- allocation failures would pass silently.
46 cusparseCreate(&this->handle);
47 cusparseCreateMatDescr(&this->descrA);
49 cusparseSetMatType(this->descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
50 cusparseSetMatIndexBase(this->descrA, CUSPARSE_INDEX_BASE_ZERO);
// nnz is the last element of the compressed pointer array:
// rowList[aNumRows] for CSR input, colList[aNumCols] for CSC input
// (the selecting if/else sits on lines missing from this view).
55 this->nnz = rowList[aNumRows];
59 this->nnz = colList[aNumCols];
// Device storage for the final CSR representation of the matrix.
62 cudaMalloc(&this->listValues, this->nnz *
sizeof(T));
63 cudaMalloc(&this->listColIndices, this->nnz *
sizeof(
int));
64 cudaMalloc(&this->listRowEntries, (aNumRows + 1) *
sizeof(
int));
// Input was CSC: stage it in temporary device buffers, then convert.
66 if (formatCRS ==
false)
70 int* listColIndicesTmp;
71 int* listRowEntriesTmp;
// listValuesTmp is declared on an original line not visible here.
73 cudaMalloc(&listValuesTmp, this->nnz *
sizeof(T));
74 cudaMemcpy(listValuesTmp, indexVal, this->nnz *
sizeof(T), cudaMemcpyHostToDevice);
// For CSC input, colList is the (aNumCols + 1)-long column pointer array
// and rowList holds the nnz row indices.
75 cudaMalloc(&listColIndicesTmp, (aNumCols + 1) *
sizeof(
int));
76 cudaMemcpy(listColIndicesTmp, colList, (aNumCols + 1) *
sizeof(
int), cudaMemcpyHostToDevice);
77 cudaMalloc(&listRowEntriesTmp, this->nnz *
sizeof(
int));
78 cudaMemcpy(listRowEntriesTmp, rowList, this->nnz *
sizeof(
int), cudaMemcpyHostToDevice);
// CSC -> CSR conversion: treating the CSC triple as a CSR matrix of the
// transposed shape, csr2csc yields the desired CSR arrays.
// NOTE(review): cusparseScsr2csc is the float-only ('S') routine although
// the class is templated on T -- presumably T == float here; confirm. Also
// note this legacy API was removed in CUDA 11 (cusparseCsr2cscEx2).
81 cudaDeviceSynchronize();
82 cusparseStatus_t status = cusparseScsr2csc(this->handle, aNumCols, aNumRows, this->nnz, listValuesTmp, listColIndicesTmp, listRowEntriesTmp, this->listValues, this->listColIndices, this->listRowEntries, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
83 cudaDeviceSynchronize();
// Staging buffers are no longer needed once the conversion finished.
85 cudaFree(listValuesTmp);
86 cudaFree(listColIndicesTmp);
87 cudaFree(listRowEntriesTmp);
// Diagnostic switch over the conversion status. The switch header, braces,
// and any break statements are on lines missing from this extraction --
// TODO confirm each case breaks rather than falls through.
93 case CUSPARSE_STATUS_SUCCESS:
95 printf(
"Copy was successfull\n");
98 case CUSPARSE_STATUS_NOT_INITIALIZED:
100 printf(
"the library was not initialized\n");
103 case CUSPARSE_STATUS_ALLOC_FAILED:
105 printf(
"the resources could not be allocated\n");
108 case CUSPARSE_STATUS_INVALID_VALUE:
110 printf(
"invalid parameters were passed(m, n, nnz<0)\n");
113 case CUSPARSE_STATUS_ARCH_MISMATCH:
115 printf(
"the device does not support double precision\n");
118 case CUSPARSE_STATUS_EXECUTION_FAILED:
120 printf(
"the function failed to launch on the GPU\n");
123 case CUSPARSE_STATUS_INTERNAL_ERROR:
125 printf(
"the function failed to launch on the GPU\n");
130 printf(
"Error Copy!");
// Input already CSR: copy the three host arrays straight into the device
// buffers (else-branch; its brace lines are not visible here).
138 cudaMemcpy(this->listValues, indexVal, this->nnz *
sizeof(T), cudaMemcpyHostToDevice);
139 cudaMemcpy(this->listColIndices, colList, this->nnz *
sizeof(
int), cudaMemcpyHostToDevice);
140 cudaMemcpy(this->listRowEntries, rowList, (aNumRows + 1) *
sizeof(
int), cudaMemcpyHostToDevice);
// Destructor (its signature line ~145 is not visible in this extraction):
// releases the three device-side CSR arrays.
// NOTE(review): no cusparseDestroy(handle) / cusparseDestroyMatDescr(descrA)
// is visible here -- if they are also absent from the full source, the
// cuSPARSE context leaks; verify against the original file.
146 if (VERBOSE > 0) printf(
"MatrixGPU destructor!");
148 cudaFree(this->listValues);
149 cudaFree(this->listColIndices);
150 cudaFree(this->listRowEntries);
// Fragment of copy() (declared at original line 153): pulls the CSR arrays
// device -> host, presumably to construct the clone from them on original
// lines 165-168, which are missing from this extraction -- verify.
// NOTE(review): unchecked malloc/cudaMemcpy return values.
158 T *hostValues = (T *)malloc(this->nnz *
sizeof(T));
159 int *hostRowIndices = (
int *)malloc((this->
getNumRows() + 1) *
sizeof(
int));
160 int *hostColIndices = (
int *)malloc(this->nnz *
sizeof(
int));
162 cudaMemcpy(hostValues, this->listValues, this->nnz *
sizeof(T), cudaMemcpyDeviceToHost);
163 cudaMemcpy(hostRowIndices, this->listRowEntries, (this->
getNumRows() + 1) *
sizeof(
int), cudaMemcpyDeviceToHost);
164 cudaMemcpy(hostColIndices, this->listColIndices, this->nnz *
sizeof(
int), cudaMemcpyDeviceToHost);
// NOTE(review): free(hostValues) is not visible (only the two index buffers
// are freed) -- confirm it happens on one of the missing lines, otherwise
// every copy() leaks nnz * sizeof(T) bytes of host memory.
169 free(hostRowIndices);
170 free(hostColIndices);
// times(): applies the operator, output = op(A) * input, where op is the
// transpose when `transposed` is true. alpha/beta are declared on original
// lines missing from this extraction (their values cannot be confirmed here).
// NOTE(review): cusparseScsrmv is the float-only legacy SpMV (removed in
// CUDA 11, replaced by cusparseSpMV) although the class is templated on T --
// presumably T == float; confirm.
175 void times(
bool transposed,
const Tdata &input, Tdata &output)
// Raw device pointers extracted from the thrust vectors for cuSPARSE.
190 T* ptrOutput = thrust::raw_pointer_cast(output.data());
191 const T* ptrInput = thrust::raw_pointer_cast(input.data());
193 if (transposed ==
false)
195 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// else-branch (brace lines missing): multiply with the transposed matrix.
199 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// timesPlus(): per the doxygen summary, applies the operator and adds the
// result to output (output += op(A) * input). The accumulate behavior would
// come from the beta value, which is set on lines missing from this
// extraction -- presumably beta == 1; confirm. Same float-only
// cusparseScsrmv caveat as in times().
203 void timesPlus(
bool transposed,
const Tdata &input, Tdata &output)
// Raw device pointers extracted from the thrust vectors for cuSPARSE.
218 T* ptrOutput = thrust::raw_pointer_cast(output.data());
219 const T* ptrInput = thrust::raw_pointer_cast(input.data());
221 if (transposed ==
false)
223 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// else-branch (brace lines missing): accumulate with the transposed matrix.
227 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// timesMinus(): per the doxygen summary, applies the operator and subtracts
// the result from output (output -= op(A) * input). The subtraction would
// come from the alpha/beta values set on lines missing from this extraction
// -- presumably alpha == -1, beta == 1; confirm. Same float-only
// cusparseScsrmv caveat as in times().
231 void timesMinus(
bool transposed,
const Tdata &input, Tdata &output)
// Raw device pointers extracted from the thrust vectors for cuSPARSE.
245 T* ptrOutput = thrust::raw_pointer_cast(output.data());
246 const T* ptrInput = thrust::raw_pointer_cast(input.data());
248 if (transposed ==
false)
250 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// else-branch (brace lines missing): subtract with the transposed matrix.
254 cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->
getNumRows(), this->
getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
// Isolated fragment (original line 267, inside getAbsRowSum/getMaxRowSumAbs
// per the doxygen index; the rest of the method is missing from this view):
// a one-element host result vector, presumably filled on the missing lines.
267 std::vector<T> result(1);
// printMatrix() (declared at original line 281): copies the CSR arrays
// device -> host for printing; the actual print loop sits on original lines
// ~290-293, which are missing from this extraction.
283 T *hostValues = (T *)malloc(this->nnz *
sizeof(T));
284 int *hostRowIndices = (
int *)malloc((this->
getNumRows() + 1) *
sizeof(
int));
285 int *hostColIndices = (
int *)malloc(this->nnz *
sizeof(
int));
287 cudaMemcpy(hostValues, this->listValues, this->nnz *
sizeof(T), cudaMemcpyDeviceToHost);
288 cudaMemcpy(hostRowIndices, this->listRowEntries, (this->
getNumRows() + 1) *
sizeof(
int), cudaMemcpyDeviceToHost);
289 cudaMemcpy(hostColIndices, this->listColIndices, this->nnz *
sizeof(
int), cudaMemcpyDeviceToHost);
// Ensure the copies completed before the (missing) print loop reads them.
291 cudaDeviceSynchronize();
// NOTE(review): free(hostValues) is not visible here -- confirm it is on a
// missing line, otherwise this leaks host memory on every call.
294 free(hostRowIndices);
295 free(hostColIndices);
// getAbsRowSumCUDA() (declared at original line 298): returns a device
// vector of per-row sums of absolute values (per-column when transposed),
// used for preconditioning. Despite the name, the visible accumulation runs
// on the HOST over copied-back CSR arrays; the device work, if any, is on
// missing lines -- verify.
300 std::vector<T> resultTmp;
// NOTE(review): the branch body and resultTmp's sizing (it is indexed with
// operator[] below) are on lines missing from this extraction -- confirm
// resultTmp is resized before the accumulation loop.
302 if (transposed ==
false)
// Pull values, row pointers, and column indices back to the host.
312 T *hostValues = (T *)malloc(this->nnz *
sizeof(T));
313 int *hostRowIndices = (
int *)malloc((this->
getNumRows() + 1) *
sizeof(
int));
314 int *hostColIndices = (
int *)malloc(this->nnz *
sizeof(
int));
316 cudaMemcpy(hostValues, this->listValues, this->nnz *
sizeof(T), cudaMemcpyDeviceToHost);
317 cudaMemcpy(hostRowIndices, this->listRowEntries, (this->
getNumRows() + 1) *
sizeof(
int), cudaMemcpyDeviceToHost);
318 cudaMemcpy(hostColIndices, this->listColIndices, this->nnz *
sizeof(
int), cudaMemcpyDeviceToHost);
// Walk each row's [start, stop) slice of the value array.
320 for (
int row = 0; row < this->
getNumRows(); row++)
322 int starting_col_index = hostRowIndices[row];
323 int stopping_col_index = hostRowIndices[row + 1];
// Empty row: nothing to accumulate (branch body on missing lines).
324 if (starting_col_index == stopping_col_index)
328 for (
int current_col_index = starting_col_index; current_col_index < stopping_col_index; current_col_index++)
330 if (transposed ==
false)
// Row sums: accumulate |value| into the entry for this row.
332 resultTmp[row] += std::abs(hostValues[current_col_index]);
// Transposed: accumulate |value| into the entry for its column.
336 resultTmp[hostColIndices[current_col_index]] += std::abs(hostValues[current_col_index]);
// NOTE(review): free(hostValues) is not visible here -- confirm it is on a
// missing line, otherwise host memory leaks on every call.
343 free(hostRowIndices);
344 free(hostColIndices);
// Copy the host-side sums into a device vector for the caller.
346 Tdata result(resultTmp.size());
348 thrust::copy(resultTmp.begin(), resultTmp.end(), result.begin());
int getNumRows() const
returns number of rows of the linear operator
Definition: flexLinearOperator.h:57
bool isMinus
determines if operator is negated
Definition: flexLinearOperator.h:25
void times(bool transposed, const Tdata &input, Tdata &output)
applies linear operator on vector
Definition: flexMatrixGPU.h:175
flexMatrixGPU< T > * copy()
copies the linear operator
Definition: flexMatrixGPU.h:153
int getNumCols() const
returns number of columns of the linear operator
Definition: flexLinearOperator.h:48
void timesMinus(bool transposed, const Tdata &input, Tdata &output)
applies the linear operator to a vector and subtracts its result from y
Definition: flexMatrixGPU.h:231
void printRow(int i)
prints requested row
Definition: flexMatrixGPU.h:275
void timesPlus(bool transposed, const Tdata &input, Tdata &output)
applies linear operator on vector and adds its result to y
Definition: flexMatrixGPU.h:203
void printMatrix()
prints the whole matrix
Definition: flexMatrixGPU.h:281
represents a (CUDA) matrix
Definition: flexMatrixGPU.h:13
T getMaxRowSumAbs(bool transposed)
returns the maximum sum of absolute values per row used for preconditioning
Definition: flexMatrixGPU.h:259
flexMatrixGPU(int aNumRows, int aNumCols, int *rowList, int *colList, T *indexVal, bool formatCRS, bool aMinus)
initializes a matrix
Definition: flexMatrixGPU.h:43
thrust::device_vector< T > getAbsRowSumCUDA(bool transposed)
same function as getAbsRowSum() but implemented in CUDA
Definition: flexMatrixGPU.h:298
abstract base class for linear operators
Definition: flexLinearOperator.h:12
std::vector< T > getAbsRowSum(bool transposed)
returns a vector of sum of absolute values per row used for preconditioning
Definition: flexMatrixGPU.h:265