![]() ![]() |
Nov 10 2009, 05:41 AM
Post
#1
|
|
![]() Group: Members Posts: 8 Joined: 3-November 09 Member No.: 243,810 Club SLI Member: No |
I wrote a program to compute the product of two square (3*3) matrices. It gives the right result for the first 3 elements, but the remaining elements are garbage values (see the attached snapshot of the output). I use Visual Studio 2005 with an NVIDIA GPU on Win32.
Why does it not give the right result? CODE /************************************************************
******** * SquareMatrixMultification * This is a example of the CUDA program. ************************************************************ *********/ #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include <cutil.h> #include<conio.h> __global__ void SquareMatrixMul(float *x,float *y,float *z,int width) { int i=blockIdx.y*blockDim.y+threadIdx.y; int j=blockIdx.x*blockDim.x+threadIdx.x; int k; float a,b; float sum=0; for(k=0;k<width;k++) { a=x[i*width+k]; b=y[k*width+j]; sum+=a*b; } z[i*width+j]=sum; } int main() { float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d; int width=3; int i; size_t size=sizeof(float)*(width*width); a_h=(float *)malloc(size); b_h=(float *)malloc(size); c_h=(float *)malloc(size); cudaMalloc((void **)&a_d,size); cudaMalloc((void **)&b_d,size); cudaMalloc((void **)&c_d,size); printf("\nEnter the Elements of First Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&a_h[i]); } printf("\nElements of First Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",a_h[i]); } cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); printf("\nEnter the Elements of Second Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&b_h[i]); } printf("\nElements of Second Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",b_h[i]); } cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice); int blocksize=4; width=width*width; int nblock=width/blocksize+(width%blocksize==0?0:1); width=3; SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost); printf("\nMultification of SquareMatrics"); for(i=0;i<(width*width);i++) { printf("\n%f",c_h[i]); } free(a_h); free(b_h); free(c_h); cudaFree(a_d); cudaFree(b_d); cudaFree(c_d); getch(); return 0; }
Attached File(s)
|
|
|
|
Nov 10 2009, 05:50 AM
Post
#2
|
|
![]() ![]() ![]() ![]() ![]() ![]() Group: Members Posts: 261 Joined: 24-September 07 Member No.: 71,314 Org.: Tsing Hua university, R.O.C (Taiwan) |
You need to impose a boundary condition:
CODE __global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{ int i=blockIdx.y*blockDim.y+threadIdx.y; int j=blockIdx.x*blockDim.x+threadIdx.x; int k; float a,b; float sum=0; if ( (i < width) && ( j < width ) ){ for(k=0;k<width;k++) { a=x[i*width+k]; b=y[k*width+j]; sum+=a*b; } z[i*width+j]=sum; }// for valid (i,j) } -------------------- Department of Mathematics, Tsing Hua university, R.O.C.
Lung Sheng Chien |
|
|
|
Nov 10 2009, 06:34 AM
Post
#3
|
|
![]() Group: Members Posts: 8 Joined: 3-November 09 Member No.: 243,810 Club SLI Member: No |
you need to impose boundary condition CODE __global__ void SquareMatrixMul(float *x,float *y,float *z,int width) { int i=blockIdx.y*blockDim.y+threadIdx.y; int j=blockIdx.x*blockDim.x+threadIdx.x; int k; float a,b; float sum=0; if ( (i < width) && ( j < width ) ){ for(k=0;k<width;k++) { a=x[i*width+k]; b=y[k*width+j]; sum+=a*b; } z[i*width+j]=sum; }// for valid (i,j) } Respected Sir, Thanks to quick reply.I change my code according your suggestion.But this it is given Wrong Result(snapshot of output).Why it is not given right result . CODE #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include <cutil.h> #include<conio.h> __global__ void SquareMatrixMul(float *x,float *y,float *z,int width) { int i=blockIdx.y*blockDim.y+threadIdx.y; int j=blockIdx.x*blockDim.x+threadIdx.x; int k; float a,b; float sum=0; if((i<width)&&(j<width)) { for(k=0;k<width;k++) { a=x[i*width+k]; b=y[k*width+j]; sum+=a*b; } z[i*width+j]=sum; } } int main() { float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d; int width=3; int i; size_t size=sizeof(float)*(width*width); a_h=(float *)malloc(size); b_h=(float *)malloc(size); c_h=(float *)malloc(size); cudaMalloc((void **)&a_d,size); cudaMalloc((void **)&b_d,size); cudaMalloc((void **)&c_d,size); printf("\nEnter the Elements of First Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&a_h[i]); } printf("\nElements of First Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",a_h[i]); } cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); printf("\nEnter the Elements of Second Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&b_h[i]); } printf("\nElements of Second Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",b_h[i]); } cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice); int blocksize=4; width=width*width; int nblock=width/blocksize+(width%blocksize==0?0:1); width=3; SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost); printf("\nMultification of SquareMatrics"); 
for(i=0;i<(width*width);i++) { printf("\n%f",c_h[i]); } free(a_h); free(b_h); free(c_h); cudaFree(a_d); cudaFree(b_d); cudaFree(c_d); getch(); return 0; } Thanking you Deepak Bajaj
Attached File(s)
|
|
|
|
Nov 10 2009, 06:44 AM
Post
#4
|
|
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Group: Members Posts: 1,567 Joined: 23-November 07 From: Bangalore Member No.: 79,873 Org.: HCL Technologies |
Bajaj,
1. Attach JPG files. They are far smaller than BMP files. 2. You can address people by "First Name". "Respected Sir" is not normal in internet forums. 3. The best debugger lies in between your ears. -------------------- Ignorance Rules; Knowledge Liberates!
|
|
|
|
Nov 10 2009, 09:24 AM
Post
#5
|
|
![]() ![]() ![]() ![]() ![]() ![]() Group: Members Posts: 261 Joined: 24-September 07 Member No.: 71,314 Org.: Tsing Hua university, R.O.C (Taiwan) |
You use a 1-D thread block and a 1-D grid, so this cannot cover all (i,j) in your kernel: with a 1-D launch, blockIdx.y and threadIdx.y are always 0, so i is always 0 and only the first row of the result is computed. Modify your code CODE int blocksize=4; width=width*width; int nblock=width/blocksize+(width%blocksize==0?0:1); width=3; SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); to a 2-D thread block and a 2-D grid as CODE dim3 blocksize( 2, 2 );
dim3 nblock( (width+1)/2, (width+1)/2 ); SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); -------------------- Department of Mathematics, Tsing Hua university, R.O.C.
Lung Sheng Chien |
|
|
|
Nov 10 2009, 10:42 AM
Post
#6
|
|
![]() Group: Members Posts: 8 Joined: 3-November 09 Member No.: 243,810 Club SLI Member: No |
I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32. Why it is not given a right result. CODE /************************************************************ ******** * SquareMatrixMultification * This is a example of the CUDA program. ************************************************************ *********/ #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include <cutil.h> #include<conio.h> __global__ void SquareMatrixMul(float *x,float *y,float *z,int width) { int i=blockIdx.y*blockDim.y+threadIdx.y; int j=blockIdx.x*blockDim.x+threadIdx.x; int k; float a,b; float sum=0; for(k=0;k<width;k++) { a=x[i*width+k]; b=y[k*width+j]; sum+=a*b; } z[i*width+j]=sum; } int main() { float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d; int width=3; int i; size_t size=sizeof(float)*(width*width); a_h=(float *)malloc(size); b_h=(float *)malloc(size); c_h=(float *)malloc(size); cudaMalloc((void **)&a_d,size); cudaMalloc((void **)&b_d,size); cudaMalloc((void **)&c_d,size); printf("\nEnter the Elements of First Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&a_h[i]); } printf("\nElements of First Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",a_h[i]); } cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); printf("\nEnter the Elements of Second Matrix"); for(i=0;i<(width*width);i++) { scanf("%f",&b_h[i]); } printf("\nElements of Second Matrix"); for(i=0;i<(width*width);i++) { printf("\n%f",b_h[i]); } cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice); int blocksize=4; width=width*width; int nblock=width/blocksize+(width%blocksize==0?0:1); width=3; SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost); printf("\nMultification of SquareMatrics"); for(i=0;i<(width*width);i++) { printf("\n%f",c_h[i]); } free(a_h); free(b_h); free(c_h); cudaFree(a_d); 
cudaFree(b_d); cudaFree(c_d); getch(); return 0; } |
|
|
|
Nov 10 2009, 10:46 AM
Post
#7
|
|
![]() Group: Members Posts: 8 Joined: 3-November 09 Member No.: 243,810 Club SLI Member: No |
you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel. modify your code CODE int blocksize=4; width=width*width; int nblock=width/blocksize+(width%blocksize==0?0:1); width=3; SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); to 2-D threads block and 2-D grid block as CODE dim3 blocksize( 2, 2 ); dim3 nblock( (width+1)/2, (width+1)/2 ); SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width); Thanks LSChien . |
|
|
|
![]() ![]() |
| Copyright 2008 NVIDIA Corporation. Terms of Use | Legal Info | Privacy Policy | Time is now: 23rd November 2009 - 02:02 PM |