IPB

Welcome Guest ( Log In | Register )

 
Reply to this topicStart new topic
> Take Garbage Value, At the place of Result
Dbajaj
post Nov 10 2009, 05:41 AM
Post #1



*

Group: Members
Posts: 8
Joined: 3-November 09
Member No.: 243,810
Club SLI Member: No



I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32.
Why it is not given a right result.
CODE
/************************************************************
********
*  SquareMatrixMultification
*  This is a example of the CUDA program.
************************************************************
*********/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include<conio.h>
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
        int i=blockIdx.y*blockDim.y+threadIdx.y;
        int j=blockIdx.x*blockDim.x+threadIdx.x;
        int k;
        float a,b;
        float sum=0;
        for(k=0;k<width;k++)
        {
             a=x[i*width+k];
            
             b=y[k*width+j];
            
            sum+=a*b;
        }
        
        z[i*width+j]=sum;
}
int main()
{
            float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
            int width=3;
            int i;
            size_t size=sizeof(float)*(width*width);
            a_h=(float *)malloc(size);
            b_h=(float *)malloc(size);
            c_h=(float *)malloc(size);
            cudaMalloc((void **)&a_d,size);
            cudaMalloc((void **)&b_d,size);
            cudaMalloc((void **)&c_d,size);
            printf("\nEnter the Elements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                 scanf("%f",&a_h[i]);
            }
            
            printf("\nElements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",a_h[i]);
            }
            cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
            printf("\nEnter the Elements of Second Matrix");
            for(i=0;i<(width*width);i++)

            {
                 scanf("%f",&b_h[i]);
            }
            printf("\nElements of Second Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",b_h[i]);
            }
            cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
            int blocksize=4;
            width=width*width;
            int nblock=width/blocksize+(width%blocksize==0?0:1);
            width=3;
            SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
            cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
            printf("\nMultification of SquareMatrics");
            for(i=0;i<(width*width);i++)
            {
                 printf("\n%f",c_h[i]);
            }
            free(a_h);
            free(b_h);
            free(c_h);
            cudaFree(a_d);
            cudaFree(b_d);
            cudaFree(c_d);
            getch();
            return 0;
}

Attached File(s)
Attached File  SquareMatrixMultification.JPG ( 122.61K ) Number of downloads: 3
 
Go to the top of the page
 
+Quote Post
LSChien
post Nov 10 2009, 05:50 AM
Post #2



******

Group: Members
Posts: 261
Joined: 24-September 07
Member No.: 71,314
Org.: Tsing Hua university, R.O.C (Taiwan)



you need to impose boundary condition
CODE
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
        int i=blockIdx.y*blockDim.y+threadIdx.y;
        int j=blockIdx.x*blockDim.x+threadIdx.x;
        int k;
        float a,b;
        float sum=0;
        if ( (i < width) && ( j < width ) ){
            for(k=0;k<width;k++)
            {
             a=x[i*width+k];
            
             b=y[k*width+j];
            
            sum+=a*b;
            }
        
            z[i*width+j]=sum;
        }// for valid (i,j)
}


--------------------
Department of Mathematics, Tsing Hua university, R.O.C.
Lung Sheng Chien
Go to the top of the page
 
+Quote Post
Dbajaj
post Nov 10 2009, 06:34 AM
Post #3



*

Group: Members
Posts: 8
Joined: 3-November 09
Member No.: 243,810
Club SLI Member: No



QUOTE (LSChien @ Nov 10 2009, 11:20 AM) *
you need to impose boundary condition
CODE
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
        int i=blockIdx.y*blockDim.y+threadIdx.y;
        int j=blockIdx.x*blockDim.x+threadIdx.x;
        int k;
        float a,b;
        float sum=0;
        if ( (i < width) && ( j < width ) ){
            for(k=0;k<width;k++)
            {
             a=x[i*width+k];
            
             b=y[k*width+j];
            
            sum+=a*b;
            }
        
            z[i*width+j]=sum;
        }// for valid (i,j)
}

Respected Sir,
Thanks to quick reply.I change my code according your suggestion.But this it is given Wrong Result(snapshot of output).Why it is not given right result .

CODE
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include<conio.h>
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
        int i=blockIdx.y*blockDim.y+threadIdx.y;
        int j=blockIdx.x*blockDim.x+threadIdx.x;
        int k;
        float a,b;
        float sum=0;
        if((i<width)&&(j<width))
        {
        for(k=0;k<width;k++)
        {
             a=x[i*width+k];
            
             b=y[k*width+j];
            
            sum+=a*b;
        }
        
        z[i*width+j]=sum;
        }
}
int main()
{
            float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
            int width=3;
            int i;
            size_t size=sizeof(float)*(width*width);
            a_h=(float *)malloc(size);
            b_h=(float *)malloc(size);
            c_h=(float *)malloc(size);
            cudaMalloc((void **)&a_d,size);
            cudaMalloc((void **)&b_d,size);
            cudaMalloc((void **)&c_d,size);
            printf("\nEnter the Elements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                 scanf("%f",&a_h[i]);
            }
            
            printf("\nElements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",a_h[i]);
            }
            cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
            printf("\nEnter the Elements of Second Matrix");
            for(i=0;i<(width*width);i++)

            {
                 scanf("%f",&b_h[i]);
            }
            printf("\nElements of Second Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",b_h[i]);
            }
            cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
            int blocksize=4;
            width=width*width;
            int nblock=width/blocksize+(width%blocksize==0?0:1);
            width=3;
            SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
            cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
            printf("\nMultification of SquareMatrics");
            for(i=0;i<(width*width);i++)
            {
                 printf("\n%f",c_h[i]);
            }
            free(a_h);
            free(b_h);
            free(c_h);
            cudaFree(a_d);
            cudaFree(b_d);
            cudaFree(c_d);
            getch();
            return 0;
}

Thanking you
Deepak Bajaj
Attached File(s)
 
Go to the top of the page
 
+Quote Post
Sarnath
post Nov 10 2009, 06:44 AM
Post #4



********

Group: Members
Posts: 1,567
Joined: 23-November 07
From: Bangalore
Member No.: 79,873
Org.: HCL Technologies



Bajaj,

1. Attach JPG files. They have far lesser size than BMP
2. You can address people by "First Name". "Respected Sir" is not normal in intenet forums.
3. The best debugger lies in between your ears.


--------------------
Ignorance Rules; Knowledge Liberates!
Go to the top of the page
 
+Quote Post
LSChien
post Nov 10 2009, 09:24 AM
Post #5



******

Group: Members
Posts: 261
Joined: 24-September 07
Member No.: 71,314
Org.: Tsing Hua university, R.O.C (Taiwan)




you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel.

modify your code
CODE
int blocksize=4;
width=width*width;
int nblock=width/blocksize+(width%blocksize==0?0:1);
width=3;
SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);


to 2-D threads block and 2-D grid block as
CODE
    dim3 blocksize( 2, 2 );
    dim3 nblock( (width+1)/2, (width+1)/2 );
    SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);


--------------------
Department of Mathematics, Tsing Hua university, R.O.C.
Lung Sheng Chien
Go to the top of the page
 
+Quote Post
Dbajaj
post Nov 10 2009, 10:42 AM
Post #6



*

Group: Members
Posts: 8
Joined: 3-November 09
Member No.: 243,810
Club SLI Member: No



QUOTE (Dbajaj @ Nov 10 2009, 11:11 AM) *
I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32.
Why it is not given a right result.
CODE
/************************************************************
********
*  SquareMatrixMultification
*  This is a example of the CUDA program.
************************************************************
*********/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include<conio.h>
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
        int i=blockIdx.y*blockDim.y+threadIdx.y;
        int j=blockIdx.x*blockDim.x+threadIdx.x;
        int k;
        float a,b;
        float sum=0;
        for(k=0;k<width;k++)
        {
             a=x[i*width+k];
            
             b=y[k*width+j];
            
            sum+=a*b;
        }
        
        z[i*width+j]=sum;
}
int main()
{
            float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
            int width=3;
            int i;
            size_t size=sizeof(float)*(width*width);
            a_h=(float *)malloc(size);
            b_h=(float *)malloc(size);
            c_h=(float *)malloc(size);
            cudaMalloc((void **)&a_d,size);
            cudaMalloc((void **)&b_d,size);
            cudaMalloc((void **)&c_d,size);
            printf("\nEnter the Elements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                 scanf("%f",&a_h[i]);
            }
            
            printf("\nElements of First Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",a_h[i]);
            }
            cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
            printf("\nEnter the Elements of Second Matrix");
            for(i=0;i<(width*width);i++)

            {
                 scanf("%f",&b_h[i]);
            }
            printf("\nElements of Second Matrix");
            for(i=0;i<(width*width);i++)
            {
                printf("\n%f",b_h[i]);
            }
            cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
            int blocksize=4;
            width=width*width;
            int nblock=width/blocksize+(width%blocksize==0?0:1);
            width=3;
            SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
            cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
            printf("\nMultification of SquareMatrics");
            for(i=0;i<(width*width);i++)
            {
                 printf("\n%f",c_h[i]);
            }
            free(a_h);
            free(b_h);
            free(c_h);
            cudaFree(a_d);
            cudaFree(b_d);
            cudaFree(c_d);
            getch();
            return 0;
}

Go to the top of the page
 
+Quote Post
Dbajaj
post Nov 10 2009, 10:46 AM
Post #7



*

Group: Members
Posts: 8
Joined: 3-November 09
Member No.: 243,810
Club SLI Member: No



QUOTE (LSChien @ Nov 10 2009, 02:54 PM) *
you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel.

modify your code
CODE
int blocksize=4;
width=width*width;
int nblock=width/blocksize+(width%blocksize==0?0:1);
width=3;
SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);


to 2-D threads block and 2-D grid block as
CODE
    dim3 blocksize( 2, 2 );
    dim3 nblock( (width+1)/2, (width+1)/2 );
    SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

Thanks LSChien .
Go to the top of the page
 
+Quote Post

Reply to this topicStart new topic

 



Copyright 2008 NVIDIA Corporation.  Terms of Use | Legal Info | Privacy Policy Time is now: 23rd November 2009 - 02:02 PM
Unites States Argentina Brazil Chile China Colombia France Germany India Italy Japan Korea Mexico Poland Russia Spain Taiwan United Kingdom Venezuela