cuda - Matrix columns permutation with cublas -


i have input matrix of size 10x20, want permute columns follows:

   p=[1 4 2 3 5 11 7 13 6 12 8 14 17 9 15 18 10 16 19 20] ;%rearrange columns of     a=a(:,p); 

to so, constructed permutation matrix corresponding permutation vector p , permuted can obtained performing following multiplication:

a=a*i 

i tested permutation in matlab , ok. now, want test in cuda using cublas.

the input matrix entered in column major. permuation matrix in column major well. following code test permutation:

#include "cuda_runtime.h" #include "device_launch_parameters.h"   #include <stdio.h> #include <stdlib.h> #include <math.h> #include <cublas_v2.h>   #define cudacall(call)                                                                                                          \                                                                                                                              \     {                                                                                                                           \         cudaerror_t err = (call);                                                                                               \         if(cudasuccess != err)                                                                                                  \         {                                                                                                                       \             fprintf(stderr,"cuda error:\nfile = %s\nline = %d\nreason = %s\n", __file__, __line__, cudageterrorstring(err));    \             cudadevicereset();                                                                                                  \             exit(exit_failure);                                                                                                 \         }                                                                                                                       \     }                                                                                                                           \     while (0)  #define cublascall(call)                                                                                        \                                                                                                              \     {                                                                                                           \         cublasstatus_t status = (call);                                                                         \         if(cublas_status_success != status)                                                                     \         {                                                                                                       \             fprintf(stderr,"cublas error:\nfile = %s\nline = %d\ncode = %d\n", __file__, __line__, status);     \             cudadevicereset();                                                                                  \             exit(exit_failure);                                                                                 \         }                                                                                                       \                                                                                                                 \     }                                                                                                           \     while(0)      __global__ void sgemm_kernel(float *a_d, float *i_d)     {          int m=10,n=20,k=20;         int lda=k, ldb=k;         cublashandle_t hdl;         cublasstatus_t status = cublascreate_v2(&hdl);          const float alpha=1.0f, beta=0.0f;         status=cublassgemm(hdl,cublas_op_n,cublas_op_n,k,n,k,&alpha,a_d,lda,i_d,ldb,&beta,a_d,lda);      }      int main(int argc, char* argv[])     {float a[10*20]={-0.0614, -0.0199, 0.0024, -0.0414, 0.1736, -0.0595, -0.2794, 0.1946, -0.0647, -0.0025,      -0.0036, 0.0628, -0.0827, 0.3679, -0.1913, 0.0500, -0.0245, 0.3855, -0.1298, -0.0334,      -0.0241, -0.0564, 0.0098, -0.2862, -0.0474, 0.0333, -0.3049, 0.2851, -0.1242, 0.0162,      0.0241, 0.0270, -0.0670, 0.3129, -0.2428, 0.0947, -0.1878, 0.0889, -0.0208, 0.0075,      -0.1559, 0.1437, -0.1916, 0.2297, -0.0833, -0.1805, 0.2522, -0.1738, 0.1027, -0.1273,      0.0716, 0.1882, -0.0963, 0.1081, 0.0958, -0.0713, 0.1931, 0.0874, -0.4186, 0.0345,      -0.1912, 0.0501, -0.1396, -0.0989, -0.0338, 0.1773, 0.1088, 0.0389, -0.0117, 0.0014,      0.1648, -0.1705, -0.0575, -0.0133, -0.0570, 0.2124, -0.0193, 0.1535, 0.0857, -0.1308,      0.1971, 0.0882, -0.2577, 0.1662, -0.2498, -0.0365, -0.1805, 0.0921, 0.0912, 0.0178,      -0.0379, 0.0080, 0.0572, -0.0067, 0.0591, -0.0136, 0.0471, -0.0163, 0.0082, -0.0338,      -0.2436, 0.1116, 0.0732, -0.0319, 0.0550, 0.2821, 0.0240, 0.0109, -0.0034, 0.1212,      -0.0061, 0.2497, -0.0542, -0.0939, 0.0651, 0.0063, -0.1367, 0.0580, 0.7389, -0.1143,      -0.3786, 0.1288, 0.0001, 0.2604, -0.1094, -0.3624, -0.0184, 0.0538, 0.0329, 0.0040,      0.0603, 0.1422, 0.1037, -0.1846, 0.4046, -0.3738, -0.3487, 0.3846, -0.0849, 0.0135,      -0.1850, 0.3571, -0.0543, -0.0025, -0.2880, 0.0600, 0.2605, -0.0474, 0.0010, -0.0333,      -0.1974, 0.4788, -0.2441, 0.3847, -0.1235, -0.3503, -0.1785, -0.1095, 0.3158, 0.0062,      -0.0509, -0.0502, 0.2154, 0.2237, -0.0671, 0.0377, 0.0519, 0.1530, -0.1675, 0.1856,      -0.0380, -0.0026, 0.4700, 0.0097, -0.2394, 0.0717, -0.2101, 0.2841, -0.1799, -0.0924,      -0.2678, 0.4485, 0.0044, 0.0030, -0.0439, 0.4337, 0.1819, -0.0180, -0.5443, 0.0864,      0.0390, -0.0235, -0.0706, 0.0138, 0.0633, -0.0147, 0.0444, -0.0334, 0.0557, 0.0507}      float i[20*20]={1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};      float *a_d, *i_d;      cudacall(cudamalloc(&a_d,10*20*sizeof( float )));     cudacall(cudamalloc(&i_d, 20*20*sizeof(float  )));     cudacall(cudamemcpy(a_d, a, 10*20*sizeof(float), cudamemcpyhosttodevice));     cudacall(cudamemcpy(i_d, i, 20*20*sizeof(float), cudamemcpyhosttodevice));      sgemm_kernel<<<1,1>>>(a_d, i_d);     cudacall(cudadevicesynchronize());      cudacall(cudamemcpy(a, a_d, 10*20*sizeof(float), cudamemcpydevicetohost));     cudacall(cudafree(a_d));     cudacall(cudafree(i_d));       return 0;     } 

i couldn't correct result.

cublas doesn't support in-place operations (in fact no parallel blas aware of supports it). cannot pass a_d , use in multiplication , matrix in operation. must use different memory allocation hold result.

so

c <- 1*(a * b) + 0*c 

is legal, whereas

a <- 1*(a * b) + 0*a 

is not.


Comments

Popular posts from this blog

php - Wordpress website dashboard page or post editor content is not showing but front end data is showing properly -

javascript - Twitter Bootstrap - how to add some more margin between tooltip popup and element -

javascript - Get parameter of GET request -