FlexBox - A Flexible Primal-Dual ToolBox
flexMatrixGPU.h
#ifndef flexMatrixGPU_H
#define flexMatrixGPU_H

#include <cuda_runtime.h>
#include <cusparse_v2.h>

//thrust is required by the device code paths below
#ifdef __CUDACC__
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#endif

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <vector>

#include "flexLinearOperator.h"
//! represents a (CUDA) matrix
template<typename T>
class flexMatrixGPU : public flexLinearOperator<T>
{

#ifdef __CUDACC__
    typedef thrust::device_vector<T> Tdata;
#else
    typedef std::vector<T> Tdata;
#endif

private:
    cusparseHandle_t handle;    //cuSPARSE library handle
    cusparseMatDescr_t descrA;  //cuSPARSE matrix descriptor

    //CSR storage in device memory
    int* listRowEntries;        //row pointers, size numRows + 1
    int* listColIndices;        //column index of every non-zero entry
    T* listValues;              //value of every non-zero entry

    int nnz;                    //number of non-zero entries

public:

    //! initializes a matrix
    flexMatrixGPU(int aNumRows, int aNumCols, int* rowList, int* colList, T* indexVal, bool formatCRS, bool aMinus) : flexLinearOperator<T>(aNumRows, aNumCols, matrixGPUOp, aMinus)
    {
        //create cuSPARSE handle and matrix descriptor
        cusparseCreate(&this->handle);
        cusparseCreateMatDescr(&this->descrA);

        cusparseSetMatType(this->descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(this->descrA, CUSPARSE_INDEX_BASE_ZERO);

        //if formatCRS is true, the input data is already in compressed row storage (CSR)
        //format; otherwise it is in compressed column storage (CSC) format and has to be
        //converted
        if (formatCRS)
        {
            this->nnz = rowList[aNumRows]; //the last entry of the row pointer array is nnz
        }
        else
        {
            this->nnz = colList[aNumCols]; //the last entry of the column pointer array is nnz
        }

        cudaMalloc(&this->listValues, this->nnz * sizeof(T));
        cudaMalloc(&this->listColIndices, this->nnz * sizeof(int));
        cudaMalloc(&this->listRowEntries, (aNumRows + 1) * sizeof(int));

        if (formatCRS == false)
        {
            //copy the CSC input to temporary device memory
            T* listValuesTmp;
            int* listColIndicesTmp;
            int* listRowEntriesTmp;

            cudaMalloc(&listValuesTmp, this->nnz * sizeof(T));
            cudaMemcpy(listValuesTmp, indexVal, this->nnz * sizeof(T), cudaMemcpyHostToDevice);
            cudaMalloc(&listColIndicesTmp, (aNumCols + 1) * sizeof(int));
            cudaMemcpy(listColIndicesTmp, colList, (aNumCols + 1) * sizeof(int), cudaMemcpyHostToDevice);
            cudaMalloc(&listRowEntriesTmp, this->nnz * sizeof(int));
            cudaMemcpy(listRowEntriesTmp, rowList, this->nnz * sizeof(int), cudaMemcpyHostToDevice);

            cudaDeviceSynchronize();
            //the CSC input is the CSR representation of the transposed matrix, so one
            //csr2csc pass converts it to CSR; note: the 'S' variant assumes T = float
            cusparseStatus_t status = cusparseScsr2csc(this->handle, aNumCols, aNumRows, this->nnz, listValuesTmp, listColIndicesTmp, listRowEntriesTmp, this->listValues, this->listColIndices, this->listRowEntries, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
            cudaDeviceSynchronize();

            cudaFree(listValuesTmp);
            cudaFree(listColIndicesTmp);
            cudaFree(listRowEntriesTmp);

            if (VERBOSE > 0)
            {
                switch (status)
                {
                    case CUSPARSE_STATUS_SUCCESS:
                    {
                        printf("Copy was successful\n");
                        break;
                    }
                    case CUSPARSE_STATUS_NOT_INITIALIZED:
                    {
                        printf("the library was not initialized\n");
                        break;
                    }
                    case CUSPARSE_STATUS_ALLOC_FAILED:
                    {
                        printf("the resources could not be allocated\n");
                        break;
                    }
                    case CUSPARSE_STATUS_INVALID_VALUE:
                    {
                        printf("invalid parameters were passed (m, n, nnz < 0)\n");
                        break;
                    }
                    case CUSPARSE_STATUS_ARCH_MISMATCH:
                    {
                        printf("the device does not support double precision\n");
                        break;
                    }
                    case CUSPARSE_STATUS_EXECUTION_FAILED:
                    {
                        printf("the function failed to launch on the GPU\n");
                        break;
                    }
                    case CUSPARSE_STATUS_INTERNAL_ERROR:
                    {
                        printf("an internal operation failed\n");
                        break;
                    }
                    default:
                    {
                        printf("Error Copy!\n");
                        break;
                    }
                }
            }
        }
        else
        {
            //the input is already in CSR format and can be copied to the device directly
            cudaMemcpy(this->listValues, indexVal, this->nnz * sizeof(T), cudaMemcpyHostToDevice);
            cudaMemcpy(this->listColIndices, colList, this->nnz * sizeof(int), cudaMemcpyHostToDevice);
            cudaMemcpy(this->listRowEntries, rowList, (aNumRows + 1) * sizeof(int), cudaMemcpyHostToDevice);
        }
    }
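
    //worked example (hypothetical values) of the CSC-to-CSR conversion performed by the
    //constructor: for the 2x3 matrix [1 0 2; 0 3 0], the CSC input passed with
    //formatCRS == false is
    //  colList  = {0, 1, 2, 3}  (column pointers; nnz = colList[3] = 3)
    //  rowList  = {0, 1, 0}     (row index of every non-zero entry)
    //  indexVal = {1, 3, 2}     (values in column-major order)
    //and the cusparseScsr2csc call produces the CSR storage
    //  listRowEntries = {0, 2, 3}, listColIndices = {0, 2, 1}, listValues = {1, 2, 3}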

    ~flexMatrixGPU()
    {
        if (VERBOSE > 0) printf("MatrixGPU destructor!\n");
        //free cuda memory
        cudaFree(this->listValues);
        cudaFree(this->listColIndices);
        cudaFree(this->listRowEntries);
        //release the cuSPARSE descriptor and handle
        cusparseDestroyMatDescr(this->descrA);
        cusparseDestroy(this->handle);
    }

    //! copies the linear operator
    flexMatrixGPU<T>* copy()
    {
        //copy matrix data to host

        //allocate host memory
        T *hostValues = (T *)malloc(this->nnz * sizeof(T));
        int *hostRowIndices = (int *)malloc((this->getNumRows() + 1) * sizeof(int));
        int *hostColIndices = (int *)malloc(this->nnz * sizeof(int));

        cudaMemcpy(hostValues, this->listValues, this->nnz * sizeof(T), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostRowIndices, this->listRowEntries, (this->getNumRows() + 1) * sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostColIndices, this->listColIndices, this->nnz * sizeof(int), cudaMemcpyDeviceToHost);

        //the new object copies the host data back into its own device memory
        flexMatrixGPU<T>* A = new flexMatrixGPU<T>(this->getNumRows(), this->getNumCols(), hostRowIndices, hostColIndices, hostValues, true, this->isMinus);

        free(hostValues);
        free(hostRowIndices);
        free(hostColIndices);

        return A;
    }

    //! applies the linear operator to a vector
    void times(bool transposed, const Tdata &input, Tdata &output)
    {
        //alpha carries the sign of the operator
        T alpha;
        if (this->isMinus)
        {
            alpha = (T)-1;
        }
        else
        {
            alpha = (T)1;
        }

        const T beta = (T)0; //beta = 0 overwrites the output vector

        T* ptrOutput = thrust::raw_pointer_cast(output.data());
        const T* ptrInput = thrust::raw_pointer_cast(input.data());

        //note: the 'S' variant of csrmv assumes T = float
        if (transposed == false)
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
        else
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
    }

    //! applies the linear operator to a vector and adds the result to output
    void timesPlus(bool transposed, const Tdata &input, Tdata &output)
    {
        T alpha;
        if (this->isMinus)
        {
            alpha = (T)-1;
        }
        else
        {
            alpha = (T)1;
        }

        const T beta = (T)1; //beta = 1 accumulates into the output vector

        T* ptrOutput = thrust::raw_pointer_cast(output.data());
        const T* ptrInput = thrust::raw_pointer_cast(input.data());

        if (transposed == false)
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
        else
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
    }

    //! applies the linear operator to a vector and subtracts the result from output
    void timesMinus(bool transposed, const Tdata &input, Tdata &output)
    {
        //the sign of alpha is flipped compared to times() and timesPlus()
        T alpha;
        if (this->isMinus)
        {
            alpha = (T)1;
        }
        else
        {
            alpha = (T)-1;
        }

        const T beta = (T)1;

        T* ptrOutput = thrust::raw_pointer_cast(output.data());
        const T* ptrInput = thrust::raw_pointer_cast(input.data());

        if (transposed == false)
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
        else
        {
            cusparseScsrmv(this->handle, CUSPARSE_OPERATION_TRANSPOSE, this->getNumRows(), this->getNumCols(), nnz, &alpha, this->descrA, this->listValues, this->listRowEntries, this->listColIndices, ptrInput, &beta, ptrOutput);
        }
    }
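
    //all three apply functions map to cusparse?csrmv, which computes
    //  y = alpha * op(A) * x + beta * y,  op(A) = A or A^T:
    //times() uses beta = 0 (overwrite), timesPlus() uses beta = 1 (accumulate), and
    //timesMinus() uses beta = 1 with the sign of alpha flipped (subtract)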

    //! returns the maximum sum of absolute values per row used for preconditioning
    //TODO: implement; 1 is just a placeholder
    T getMaxRowSumAbs(bool transposed)
    {
        return 1;
    }

    //! returns a vector of sums of absolute values per row used for preconditioning
    //dummy, this function is not used in a CUDA setting; see getAbsRowSumCUDA() below
    std::vector<T> getAbsRowSum(bool transposed)
    {
        std::vector<T> result(1);
        return result;
    }

    //! prints the requested row
    void printRow(int i)
    {
        //TODO: not implemented yet
    }

    //! prints the whole matrix
    void printMatrix()
    {
        //copy matrix data to host
        T *hostValues = (T *)malloc(this->nnz * sizeof(T));
        int *hostRowIndices = (int *)malloc((this->getNumRows() + 1) * sizeof(int));
        int *hostColIndices = (int *)malloc(this->nnz * sizeof(int));

        cudaMemcpy(hostValues, this->listValues, this->nnz * sizeof(T), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostRowIndices, this->listRowEntries, (this->getNumRows() + 1) * sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostColIndices, this->listColIndices, this->nnz * sizeof(int), cudaMemcpyDeviceToHost);

        cudaDeviceSynchronize();

        //print all non-zero entries row by row
        for (int row = 0; row < this->getNumRows(); row++)
        {
            for (int index = hostRowIndices[row]; index < hostRowIndices[row + 1]; index++)
            {
                printf("(%d,%d): %f\n", row, hostColIndices[index], (float)hostValues[index]);
            }
        }

        free(hostValues);
        free(hostRowIndices);
        free(hostColIndices);
    }

    //! same function as getAbsRowSum() but implemented in CUDA
    thrust::device_vector<T> getAbsRowSumCUDA(bool transposed)
    {
        std::vector<T> resultTmp;

        //one entry per row of A, or per row of A^T (i.e. per column of A)
        if (transposed == false)
        {
            resultTmp.resize(this->getNumRows());
        }
        else
        {
            resultTmp.resize(this->getNumCols());
        }

        //copy matrix data to host
        T *hostValues = (T *)malloc(this->nnz * sizeof(T));
        int *hostRowIndices = (int *)malloc((this->getNumRows() + 1) * sizeof(int));
        int *hostColIndices = (int *)malloc(this->nnz * sizeof(int));

        cudaMemcpy(hostValues, this->listValues, this->nnz * sizeof(T), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostRowIndices, this->listRowEntries, (this->getNumRows() + 1) * sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(hostColIndices, this->listColIndices, this->nnz * sizeof(int), cudaMemcpyDeviceToHost);

        //accumulate the absolute values per row (or per column in the transposed case)
        for (int row = 0; row < this->getNumRows(); row++)
        {
            int starting_col_index = hostRowIndices[row];
            int stopping_col_index = hostRowIndices[row + 1];
            if (starting_col_index == stopping_col_index)
                continue;
            else
            {
                for (int current_col_index = starting_col_index; current_col_index < stopping_col_index; current_col_index++)
                {
                    if (transposed == false)
                    {
                        resultTmp[row] += std::abs(hostValues[current_col_index]);
                    }
                    else
                    {
                        resultTmp[hostColIndices[current_col_index]] += std::abs(hostValues[current_col_index]);
                    }
                }
            }
        }

        free(hostValues);
        free(hostRowIndices);
        free(hostColIndices);

        //transfer the host result to a device vector
        Tdata result(resultTmp.size());

        thrust::copy(resultTmp.begin(), resultTmp.end(), result.begin());

        return result;
    }
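
    //for the hypothetical 2x3 matrix [1 0 2; 0 3 0] from the constructor example,
    //getAbsRowSumCUDA(false) returns the row sums (3, 3) and getAbsRowSumCUDA(true)
    //the column sums (1, 3, 2); these sums are used for preconditioning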
};

#endif
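
A minimal usage sketch (not part of the header): it assumes T = float, which is what the cusparseScsr2csc/cusparseScsrmv calls above require, and uses the hypothetical 2x3 example matrix from the comments; the FlexBox headers and the CUDA toolkit must be available, and the file has to be compiled with nvcc.

#include <thrust/device_vector.h>
#include "flexMatrixGPU.h"

int main()
{
    //CSR data for the hypothetical 2x3 matrix [1 0 2; 0 3 0]
    int rowList[] = { 0, 2, 3 };            //row pointers, size numRows + 1
    int colList[] = { 0, 2, 1 };            //column index per non-zero entry
    float values[] = { 1.0f, 2.0f, 3.0f };  //non-zero values

    //formatCRS = true: the data is already CSR; aMinus = false: operator is not negated
    flexMatrixGPU<float> A(2, 3, rowList, colList, values, true, false);

    thrust::device_vector<float> x(3, 1.0f); //input vector (1, 1, 1)
    thrust::device_vector<float> y(2, 0.0f); //output vector

    A.times(false, x, y);      //y = A * x   -> (3, 3)
    A.timesPlus(false, x, y);  //y += A * x  -> (6, 6)
    A.timesMinus(false, x, y); //y -= A * x  -> (3, 3)

    return 0;
}

Since the constructor copies all arrays into device memory, the host buffers may be freed or go out of scope as soon as construction returns.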