cuda - Matrix columns permutation with cublas -
i have input matrix of size 10x20
, want permute columns follows:
p=[1 4 2 3 5 11 7 13 6 12 8 14 17 9 15 18 10 16 19 20] ;%rearrange columns of a=a(:,p);
to so, constructed permutation matrix corresponding permutation vector p , permuted can obtained performing following multiplication:
a=a*i
i tested permutation in matlab , ok. now, want test in cuda using cublas.
the input matrix entered in column major. permuation matrix in column major well. following code test permutation:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <stdlib.h> #include <math.h> #include <cublas_v2.h> #define cudacall(call) \ \ { \ cudaerror_t err = (call); \ if(cudasuccess != err) \ { \ fprintf(stderr,"cuda error:\nfile = %s\nline = %d\nreason = %s\n", __file__, __line__, cudageterrorstring(err)); \ cudadevicereset(); \ exit(exit_failure); \ } \ } \ while (0) #define cublascall(call) \ \ { \ cublasstatus_t status = (call); \ if(cublas_status_success != status) \ { \ fprintf(stderr,"cublas error:\nfile = %s\nline = %d\ncode = %d\n", __file__, __line__, status); \ cudadevicereset(); \ exit(exit_failure); \ } \ \ } \ while(0) __global__ void sgemm_kernel(float *a_d, float *i_d) { int m=10,n=20,k=20; int lda=k, ldb=k; cublashandle_t hdl; cublasstatus_t status = cublascreate_v2(&hdl); const float alpha=1.0f, beta=0.0f; status=cublassgemm(hdl,cublas_op_n,cublas_op_n,k,n,k,&alpha,a_d,lda,i_d,ldb,&beta,a_d,lda); } int main(int argc, char* argv[]) {float a[10*20]={-0.0614, -0.0199, 0.0024, -0.0414, 0.1736, -0.0595, -0.2794, 0.1946, -0.0647, -0.0025, -0.0036, 0.0628, -0.0827, 0.3679, -0.1913, 0.0500, -0.0245, 0.3855, -0.1298, -0.0334, -0.0241, -0.0564, 0.0098, -0.2862, -0.0474, 0.0333, -0.3049, 0.2851, -0.1242, 0.0162, 0.0241, 0.0270, -0.0670, 0.3129, -0.2428, 0.0947, -0.1878, 0.0889, -0.0208, 0.0075, -0.1559, 0.1437, -0.1916, 0.2297, -0.0833, -0.1805, 0.2522, -0.1738, 0.1027, -0.1273, 0.0716, 0.1882, -0.0963, 0.1081, 0.0958, -0.0713, 0.1931, 0.0874, -0.4186, 0.0345, -0.1912, 0.0501, -0.1396, -0.0989, -0.0338, 0.1773, 0.1088, 0.0389, -0.0117, 0.0014, 0.1648, -0.1705, -0.0575, -0.0133, -0.0570, 0.2124, -0.0193, 0.1535, 0.0857, -0.1308, 0.1971, 0.0882, -0.2577, 0.1662, -0.2498, -0.0365, -0.1805, 0.0921, 0.0912, 0.0178, -0.0379, 0.0080, 0.0572, -0.0067, 0.0591, -0.0136, 0.0471, -0.0163, 0.0082, -0.0338, -0.2436, 0.1116, 0.0732, -0.0319, 0.0550, 0.2821, 0.0240, 0.0109, -0.0034, 0.1212, -0.0061, 0.2497, -0.0542, -0.0939, 0.0651, 0.0063, -0.1367, 0.0580, 0.7389, -0.1143, -0.3786, 0.1288, 0.0001, 0.2604, -0.1094, -0.3624, -0.0184, 0.0538, 0.0329, 0.0040, 0.0603, 0.1422, 0.1037, -0.1846, 0.4046, -0.3738, -0.3487, 0.3846, -0.0849, 0.0135, -0.1850, 0.3571, -0.0543, -0.0025, -0.2880, 0.0600, 0.2605, -0.0474, 0.0010, -0.0333, -0.1974, 0.4788, -0.2441, 0.3847, -0.1235, -0.3503, -0.1785, -0.1095, 0.3158, 0.0062, -0.0509, -0.0502, 0.2154, 0.2237, -0.0671, 0.0377, 0.0519, 0.1530, -0.1675, 0.1856, -0.0380, -0.0026, 0.4700, 0.0097, -0.2394, 0.0717, -0.2101, 0.2841, -0.1799, -0.0924, -0.2678, 0.4485, 0.0044, 0.0030, -0.0439, 0.4337, 0.1819, -0.0180, -0.5443, 0.0864, 0.0390, -0.0235, -0.0706, 0.0138, 0.0633, -0.0147, 0.0444, -0.0334, 0.0557, 0.0507} float i[20*20]={1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; float *a_d, *i_d; cudacall(cudamalloc(&a_d,10*20*sizeof( float ))); cudacall(cudamalloc(&i_d, 20*20*sizeof(float ))); cudacall(cudamemcpy(a_d, a, 10*20*sizeof(float), cudamemcpyhosttodevice)); cudacall(cudamemcpy(i_d, i, 20*20*sizeof(float), cudamemcpyhosttodevice)); sgemm_kernel<<<1,1>>>(a_d, i_d); cudacall(cudadevicesynchronize()); cudacall(cudamemcpy(a, a_d, 10*20*sizeof(float), cudamemcpydevicetohost)); cudacall(cudafree(a_d)); cudacall(cudafree(i_d)); return 0; }
i couldn't correct result.
cublas doesn't support in-place operations (in fact no parallel blas aware of supports it). cannot pass a_d
, use in multiplication , matrix in operation. must use different memory allocation hold result.
so
c <- 1*(a * b) + 0*c
is legal, whereas
a <- 1*(a * b) + 0*a
is not.
Comments
Post a Comment