Aug 3 2007, 11:33 PM
Post
#1
|
|
![]() Group: Members Posts: 6 Joined: 27-June 07 Member No.: 59,015 |
EDIT: GeForce 8800 GTS, GeForce 8800 GTX, GeForce 8800 Ultra, and Quadro FX5600 results are now posted below.
Although not a G8x owner myself (yet!), I am very interested to know how quickly an 8800GTX could perform 1D FFTs with 128K elements, now that the 16K limit has been removed. For the benefit of all, I've written the attached benchmarking tool and invite anyone with an 8800-series card to run it and post your results. CODE #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <cufft.h> #include <cutil.h> #define MIN_NX 1024 #define MAX_NX 262144 #define MIN_BATCH 1 #define MAX_BATCH 64 #define FFTS_PER_TEST 32 int main(int argc, char **argv) { // force the program to run on a single processor DWORD processAffinityMask; DWORD systemAffinityMask; GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask); if (processAffinityMask!=1) { SetProcessAffinityMask(GetCurrentProcess(), 1); system(argv[0]); return 0; } // introduce the program printf("\nCUFFT BENCHMARKING TOOL v1.0\n\n"); printf("This program evaluates the utility of using CUDA devices as\n"); printf("FFT coprocessors for digital signal processing applications.\n"); printf("Each table entry is an estimate of the maximum number of FFTs\n"); printf("that can be performed per second, considering both the time\n"); printf("needed to calculate the FFTs and the time needed to copy data\n"); printf("to and from the CUDA device.\n\n"); printf("To minimize interference from the OS and other programs, each\n"); printf("estimate is based on the fastest of %i identical calculations.\n", FFTS_PER_TEST); // perform CUDA device initialization CUT_DEVICE_INIT(); // display CUDA device info int deviceCount; CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount)); for (int dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev)); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); printf(" Major revision number: %d\n", deviceProp.major); printf(" Minor revision number: %d\n", deviceProp.minor); printf(" Total amount of global memory: %d bytes\n", deviceProp.totalGlobalMem); printf(" Clock rate: %d kilohertz\n", deviceProp.clockRate); } // initialize host PC arrays int hostArrayLength = MAX_NX * MAX_BATCH; cufftComplex *hostArrayA = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex)); cufftComplex *hostArrayB = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex)); float *element = (float *)hostArrayA; float *elementLimit = (float *)(hostArrayA+hostArrayLength); while (element<elementLimit) *element++ = rand(); // run timing tests for in-place and out-of-place FFTs for (int out_of_place=0; out_of_place<2; ++out_of_place) { // print table headings printf("\n"); printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n"); if (out_of_place) printf("1D Complex-to-Complex Out-of-Place FFTs\n"); else printf("1D Complex-to-Complex In-Place FFTs\n"); printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n"); printf(" nx | batch\n"); printf(" "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n"); printf(" "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("|%8i", batch); printf("\n"); printf("--------"); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n"); // run timing tests for a variety of FFT array lengths for (int nx=MIN_NX; nx<MAX_NX+1; nx*=2) { printf("%8i", nx); // run timing tests for a variety of batch settings for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) { // generate CUFFT plan cufftHandle plan; CUFFT_SAFE_CALL(cufftPlan1d(&plan, nx, CUFFT_C2C, batch)); // allocate arrays on host PC and CUDA device, fill host array with random data size_t arraySize = sizeof(cufftComplex) * nx * batch; cufftComplex *deviceArrayA; cufftComplex *deviceArrayB; CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayA, arraySize)); if (out_of_place) CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayB, arraySize)); // run a series of identical timing tests, looking for the fastest one (the one with the least OS interference) int fastestRateFound = 0; char *spinner = "|/-\\"; int spindex = 0; for (int rep=0; rep<FFTS_PER_TEST; ++rep) { __int64 startCount; __int64 stopCount; __int64 countsPerSec; QueryPerformanceFrequency((LARGE_INTEGER *)&countsPerSec); if (out_of_place) { QueryPerformanceCounter((LARGE_INTEGER *)&startCount); CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice)); CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayB, CUFFT_FORWARD)); CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayB, arraySize, cudaMemcpyDeviceToHost)); QueryPerformanceCounter((LARGE_INTEGER *)&stopCount); } else { QueryPerformanceCounter((LARGE_INTEGER *)&startCount); CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice)); CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayA, CUFFT_FORWARD)); CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayA, arraySize, cudaMemcpyDeviceToHost)); QueryPerformanceCounter((LARGE_INTEGER *)&stopCount); } int fftsPerSec = (int)( (countsPerSec*batch) / (stopCount-startCount) ); if (fastestRateFound<fftsPerSec) fastestRateFound = fftsPerSec; // provide some entertainment printf("%c\b", spinner[spindex++]); if (spindex==4) spindex = 0; } printf("|%8i", fastestRateFound); CUFFT_SAFE_CALL(cufftDestroy(plan)); CUDA_SAFE_CALL(cudaFree(deviceArrayA)); if (out_of_place) CUDA_SAFE_CALL(cudaFree(deviceArrayB)); } printf("\n"); } printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n"); } free(hostArrayA); free(hostArrayB); printf("\nPress ENTER to exit...\n"); fflush( stdout); fflush( stderr); getchar(); exit(EXIT_SUCCESS); } This post has been edited by cclark: Sep 5 2007, 04:06 PM |
|
|
|
cclark CUFFT BENCHMARKING TOOL v1.0 Aug 3 2007, 11:33 PM
mfatica I did a quick conversion to Linux, I am not 100% s... Aug 4 2007, 03:12 AM
StrikerBlitz QUOTE(mfatica @ Aug 3 2007, 08:12 PM)I did a ... Aug 22 2007, 08:53 PM
cclark Since originating this thread and reading mfatica... Aug 13 2007, 10:56 PM
cclark Here are the GeForce 8800GTX benchmark results, ag... Aug 17 2007, 07:49 PM
jimh QUOTE(cclark @ Aug 17 2007, 01:49 PM)
Finally... Aug 30 2007, 04:47 PM
cmorrison I would presume it's the memory bandwidth of t... Aug 31 2007, 09:15 AM
amjoao QUOTE(jimh @ Aug 30 2007, 01:47 PM)What am I ... Sep 12 2007, 02:32 PM
SrJsignal CClark
What are your bandwidthTest scores on the ... Aug 17 2007, 08:47 PM
cclark QUOTE(SrJsignal @ Aug 17 2007, 04:47 PM)What ... Aug 17 2007, 10:12 PM
mfatica These are the bandwidth numbers using the example ... Aug 18 2007, 01:07 AM
StrikerBlitz [quote=cclark,Aug 3 2007, 04:33 PM]
-------------... Aug 22 2007, 09:06 PM
cclark QUOTE(StrikerBlitz @ Aug 22 2007, 05:06 PM)QU... Aug 22 2007, 10:36 PM
cmorrison According to the CUDA manual the 8800 is capable o... Aug 28 2007, 03:18 PM
mfatica This is the a version of the code with the timer f... Aug 28 2007, 06:07 PM
cmorrison Is that code not doing the exact same thing as ccc... Aug 29 2007, 09:16 AM
BonsaiScott QUOTE(cmorrison @ Aug 29 2007, 05:16 AM)For r... Nov 5 2007, 04:02 PM
RobinB QUOTE(BonsaiScott @ Nov 5 2007, 05:02 PM)Thei... Nov 26 2007, 03:13 PM
pedro.leite Unpinned Results:
CODEDevice 0: "GeForce ... Dec 10 2007, 06:52 PM
BonsaiScott QUOTE(RobinB @ Nov 26 2007, 11:13 AM)Hi Scott... Jan 2 2008, 07:33 PM
RobinB QUOTE(BonsaiScott @ Jan 2 2008, 08:33 PM)Not ... Jan 22 2008, 04:44 PM
mfatica Sorry, I posted the code for StrikerBlitz.
The cod... Aug 29 2007, 02:14 PM
cmorrison QUOTE(mfatica @ Aug 29 2007, 03:14 PM)Sorry, ... Aug 29 2007, 02:55 PM
mfatica The 52Gflops are on the device with no I/O, using ... Aug 29 2007, 03:02 PM
cmorrison QUOTE(mfatica @ Aug 29 2007, 04:02 PM)The 52G... Aug 29 2007, 03:35 PM
seb I've got results for the GeForce 8800 Ultra. I... Sep 5 2007, 02:39 PM
cclark seb,
Thanks for testing the Ultra. It looks like... Sep 5 2007, 04:32 PM
jimh Can everyone please post the motherboard and chips... Sep 12 2007, 04:31 PM
username3 Board: Asus P5W64 WS Professional | Chipset: Intel... Nov 22 2007, 05:10 PM
sgwood Hi all,
I'm new to everything here and have no... Jan 1 2008, 12:44 AM
halz This is with a Zotac 8800GT 'Amp'-- core i... Feb 5 2008, 07:08 AM
tprox Just started exploring CUDA, but wanted to add a t... Feb 19 2008, 04:41 PM
jimh QUOTE(tprox @ Feb 19 2008, 09:41 AM)I also ge... Feb 19 2008, 06:33 PM
KenH Working the best numbers from above (cclark, post ... Feb 20 2008, 06:38 PM
vpodlozhnyuk QUOTE(KenH @ Feb 20 2008, 09:38 PM)Working th... Feb 21 2008, 10:23 AM
KenH My understanding is that the Cell benchmarks above... Feb 21 2008, 03:06 PM
skb I've done some benchmarking on a GTS8600 using... Mar 18 2008, 06:56 PM
skb It looks like the setup for Device to Host transfe... Mar 18 2008, 08:52 PM
wickwack Has anyone had a chance to experiment with larger ... Mar 19 2008, 10:16 PM
CUDA_RM Hello. I'm new to CUDA and i'm asking you ... Apr 21 2008, 10:42 AM
XFer My bench (Cuda 1.1, G92 GTS 8800 512, Core2 Quad @... Jul 5 2008, 09:54 AM
XFer With pinned memory:
CODE
Device 0: "GeFor... Jul 7 2008, 04:51 PM
joanisaac Without pinned memory for a 8800GTX...
Device 0:... Jul 11 2008, 06:17 PM
XFer QUOTE(joanisaac @ Jul 11 2008, 08:17 PM)Does ... Jul 14 2008, 08:31 AM![]() ![]() |
| Copyright 2008 NVIDIA Corporation. Terms of Use | Legal Info | Privacy Policy | Time is now: 9th February 2010 - 11:45 PM |