VultureX
Videocard: MSI GTX970 SLI
Processor: Core i7 2700K @4.8GHz H2O
Mainboard: Asrock Z68 Extreme3 Gen3
Memory: 8GB G.Skill 2133MHz CL9
Soundcard: Xonar Essence ST - Z-5500
PSU: Corsair TX850 V2
01-24-2015, 02:36 | posts: 2,577 | Location: Netherlands

Quote:
Originally Posted by Fox2232
So, this modification should create and bench on 64MB blocks:
This version should allocate exactly 3GB and no more, or stop if there is not enough to allocate.
Try to run the 3GB version and check whether the full 3GB is taken; if so, launch some smaller game. If not, run one bench and check whether the memory stays allocated after the bench, since I am not sure if the regions get freed.
I finally figured out the compile options, so here is the requested functionality.
You can now specify the allocation block size and the maximum amount of memory that is used as follows:

vRamBandwidthTest.exe [BlockSizeMB] [MaxAllocationMB]
- BlockSizeMB: any of 16, 32, 64, 128, 256, 512, 1024
- MaxAllocationMB: any number greater than or equal to BlockSizeMB

If no arguments are given, the test runs with the 128MB block size by default and no memory limit, which corresponds exactly to the old program.
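For example, to do what Fox2232 asked for (at most 3GB allocated, in 64MB chunks), the command line would be:

Code:
vRamBandwidthTest.exe 64 3072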

Download here:
http://nl.guru3d.com/vRamBandWidthTest-guru3d.zip

Source:
Code:
#include "device_launch_parameters.h"
#include "helper_math.h"
#include <stdio.h>
#include <iostream>
#define CacheCount 5

__global__ void BenchMarkDRAMKernel(float4* In, int Float4Count)
{
	// Wrap the global thread index so every thread reads one valid float4 inside the chunk
	int ThreadID = (blockDim.x * blockIdx.x + threadIdx.x) % Float4Count;

	float4 Temp = make_float4(1);

	Temp += In[ThreadID];

	// length() is never negative, so this store never executes; it only keeps the
	// compiler from optimizing the memory read away
	if (length(Temp) == -12354)
		In[0] = Temp;
}
 
__global__ void BenchMarkCacheKernel(float4* In, int Zero, int Float4Count)
{
	int ThreadID = (blockDim.x * blockIdx.x + threadIdx.x) % Float4Count;

	float4 Temp = make_float4(1);

	// Zero is always 0 at runtime, so every iteration reads the same float4, which is
	// served from the L2 cache; passing it as an argument hides this from the compiler
#pragma unroll
	for (int i = 0; i < CacheCount; i++)
	{
		Temp += In[ThreadID + i*Zero];
	}

	// Never true (length() >= 0), but keeps the reads from being optimized away
	if (length(Temp) == -12354)
		In[0] = Temp;
}

// x & (x - 1) clears the lowest set bit, so the result is zero only for powers of two;
// the x != 0 test excludes zero itself
int isPowerOfTwo (unsigned int x)
{
  return ((x != 0) && !(x & (x - 1)));
}
 
int main(int argc, char *argv[])
{
	printf("Nai's Benchmark, edited by VultureX \n");

	//Sanity checks and some device info:
	int nDevices;
	cudaGetDeviceCount(&nDevices);
	if(nDevices >= 1) {
		cudaDeviceProp prop;
		cudaGetDeviceProperties(&prop, 0);
		printf("  Device: %s (%1.2f GB)\n", prop.name, prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
		printf("  Memory Bus Width (bits): %d\n",
			   prop.memoryBusWidth);
		printf("  Peak Theoretical DRAM Bandwidth (GB/s): %f\n\n",
			   2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
	} else {
		printf("No CUDA capable devices were found!\n");
		printf("Press return to exit...\n");
		getchar();
		return 1;
	}
	
	//Get maximum amount of memory that should be allocated
	unsigned int MemLimitMB;
	if(argc < 3 || sscanf(argv[2], " %u", &MemLimitMB) != 1) {
		MemLimitMB = INT_MAX;
	}

	//Get block size in MB, default to 128
	unsigned int ChunkSizeMB = 0;
	if(argc >= 2) {
		sscanf(argv[1], " %u", &ChunkSizeMB);	
	}
	if(ChunkSizeMB < 16 || ChunkSizeMB > 1024 || !isPowerOfTwo(ChunkSizeMB)) {
		ChunkSizeMB = 128;
	}
	if(MemLimitMB < ChunkSizeMB) {
		MemLimitMB = ChunkSizeMB;
	}
	int ChunkSize = ChunkSizeMB * 1024 * 1024; //To Bytes
	int Float4Count = ChunkSize / sizeof(float4);
	
	//Allocate as many blocks as possible
	static const int PointerCount = 5000;
	float4* Pointers[PointerCount];
	int UsedPointers = 0;
	
	printf("Allocating Memory . . . \nChunk Size: %i MiByte  \n", ChunkSizeMB);	
	while (cudaGetLastError() == cudaSuccess
		&& UsedPointers < PointerCount   // never overrun the pointer array
		&& (UsedPointers+1) * ChunkSizeMB <= MemLimitMB)
	{
		cudaMalloc(&Pointers[UsedPointers], ChunkSize); 
		if (cudaGetLastError() != cudaSuccess) {
			break;
		}

		cudaMemset(Pointers[UsedPointers], 0, ChunkSize);
		UsedPointers++;
	} 
 
	printf("Allocated %i Chunks \n", UsedPointers); 
	printf("Allocated %i MiByte \n", ChunkSizeMB*UsedPointers);
 
	//Benchmarks
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
 
	int BlockSize = 128;      // threads per thread block
	int BenchmarkCount = 30;  // how often each chunk is read per kernel launch
	int BlockCount = BenchmarkCount * Float4Count / BlockSize;
	
	printf("Benchmarking DRAM \n");
	
	for (int i = 0; i < UsedPointers; i++)
	{
		cudaEventRecord(start);
 
		BenchMarkDRAMKernel <<<BlockCount, BlockSize>>>(Pointers[i], Float4Count);
 
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);
		
		// Check for any errors launching the kernel
		cudaError_t cudaStatus = cudaGetLastError();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
			continue;
		}
		float milliseconds = 0;
		cudaEventElapsedTime(&milliseconds, start, stop);
 
		// BenchmarkCount passes over ChunkSize bytes, divided by time in ms -> GByte/s
		float Bandwidth = ((float)(BenchmarkCount)* (float)(ChunkSize)) / milliseconds / 1000.f / 1000.f;
		printf("DRAM-Bandwidth of Chunk no. %i (%i MiByte to %i MiByte):%5.2f GByte/s \n", i, ChunkSizeMB*i, ChunkSizeMB*(i + 1), Bandwidth);
	} 
 
 
	printf("Benchmarking L2-Cache \n"); 
 
	for (int i = 0; i < UsedPointers; i++)
	{
		cudaEventRecord(start);
 
		BenchMarkCacheKernel <<<BlockCount, BlockSize>>>(Pointers[i], 0, Float4Count);

		cudaEventRecord(stop);
		cudaEventSynchronize(stop);
 
		// Check for any errors launching the kernel
		cudaError_t cudaStatus = cudaGetLastError();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
			continue;
		}
		float milliseconds = 0;
		cudaEventElapsedTime(&milliseconds, start, stop);
 
		// Each thread reads its float4 CacheCount times, so the volume is CacheCount * BenchmarkCount * ChunkSize bytes
		float Bandwidth = (((float)CacheCount* (float)BenchmarkCount * (float)ChunkSize)) / milliseconds / 1000.f / 1000.f;
		printf("L2-Cache-Bandwidth of Chunk no. %i (%i MiByte to %i MiByte):%5.2f GByte/s \n", i, ChunkSizeMB*i, ChunkSizeMB*(i + 1), Bandwidth);
	}
 
 
	system("pause");
 
	cudaDeviceSynchronize();
	cudaDeviceReset();
    return 0;
}
@Fox2232:
By the way, "int BlockSize = 128;" has nothing to do with memory allocation and is best left at its current value. It actually denotes the number of threads per thread block of the GPU kernels.
The total number of threads that is run is BlockSize * BlockCount, so there will always be enough threads spawned to cover all of the memory.
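As a quick sanity check, here is the arithmetic for the default 128MB chunks (these numbers just follow from the source above):

Code:
Float4Count = 128 * 1024 * 1024 / 16 = 8388608     (a float4 is 16 bytes)
BlockCount  = 30 * 8388608 / 128     = 1966080
Threads     = 1966080 * 128          = 251658240   (= BenchmarkCount * Float4Count)

So every float4 in a chunk is read about BenchmarkCount (30) times per launch, thanks to the modulo in the kernels.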