#include "system.h" // partially copied from and inspired by NVidia's framework, as used in e.g. // https://code.google.com/archive/p/understanding-the-efficiency-of-ray-traversal-on-gpus #define ARRAY_SIZE(X) ((int)(sizeof(X)/sizeof((X)[0]))) #define IMPORTS_H #define FW_DLL_IMPORT_CUDA(RET, CALL, NAME, PARAMS, PASS) static RET (CALL *s_ ## NAME)PARAMS = NULL; #define FW_DLL_IMPORT_CUV2(RET, CALL, NAME, PARAMS, PASS) static RET (CALL *s_ ## NAME)PARAMS = NULL; #define FW_DLL_IMPORT_CUV3(RET, CALL, NAME, PARAMS, PASS) static RET (CALL *s_ ## NAME)PARAMS = NULL; #include "cudatools.h" #undef FW_DLL_IMPORT_CUDA #undef FW_DLL_IMPORT_CUV2 #undef FW_DLL_IMPORT_CUV3 static bool dllInitialized = false; bool CUDAModule::initialized = false; CUdevice CUDAModule::device; CUcontext CUDAModule::context; int CUDAModule::computeCapability = -1; int CUDAModule::SMCount = 1; static struct { const char* name; HMODULE handle; } s_importDLLs[] = { { "nvcuda.dll", NULL } }; static const struct { const char* name; void** ptr; } s_importFuncs[] = { #define FW_DLL_IMPORT_CUDA( RET, CALL, NAME, PARAMS, PASS ) { #NAME, (void**)&s_ ## NAME }, #define FW_DLL_IMPORT_CUV2( RET, CALL, NAME, PARAMS, PASS ) { #NAME "_v2", (void**)&s_ ## NAME }, { #NAME, (void**)&s_ ## NAME }, #define FW_DLL_IMPORT_CUV3( RET, CALL, NAME, PARAMS, PASS ) { #NAME "_v3", (void**)&s_ ## NAME }, { #NAME, (void**)&s_ ## NAME }, #include "cudatools.h" #undef FW_DLL_IMPORT_CUDA #undef FW_DLL_IMPORT_CUV2 #undef FW_DLL_IMPORT_CUV3 }; // setfv // ---------------------------------------------------------------------------- static void setfv( string& s, const char* fmt, va_list args ) { static char* buffer = 0; if (!buffer) buffer = new char[16384]; int len = _vscprintf( fmt, args ); if (!len) return; vsprintf_s( buffer, len + 1, fmt, args ); s = buffer; } // fail // ---------------------------------------------------------------------------- static void fail( const char* fmt, ... ) { string tmp; va_list args; va_start( args, fmt ); setfv( tmp, fmt, args ); va_end( args ); printf( "\n%s\n", tmp.c_str() ); MessageBox( NULL, tmp.c_str(), "Fatal error", MB_OK ); exit( 0 ); } // initDLLImports // ---------------------------------------------------------------------------- static void initDLLImports() { if (dllInitialized) return; for (int i = 0; i < (int)ARRAY_SIZE( s_importDLLs ); i++) { s_importDLLs[i].handle = LoadLibrary( s_importDLLs[i].name ); if (s_importDLLs[i].handle) for (int j = 0; j < (int)ARRAY_SIZE( s_importFuncs ); j++) if (!*s_importFuncs[j].ptr) *s_importFuncs[j].ptr = (void*)GetProcAddress( s_importDLLs[i].handle, s_importFuncs[j].name ); } dllInitialized = true; } #define FW_DLL_IMPORT_CUDA(RET, CALL, NAME, PARAMS, PASS) RET CALL NAME PARAMS { if (!dllInitialized) initDLLImports(); if (!s_ ## NAME) fail("Failed to import " #NAME "()!"); return s_ ## NAME PASS; } #define FW_DLL_IMPORT_CUV2(RET, CALL, NAME, PARAMS, PASS) RET CALL NAME PARAMS { if (!dllInitialized) initDLLImports(); if (!s_ ## NAME) fail("Failed to import " #NAME "()!"); return s_ ## NAME PASS; } #define FW_DLL_IMPORT_CUV3(RET, CALL, NAME, PARAMS, PASS) RET CALL NAME PARAMS { if (!dllInitialized) initDLLImports(); if (!s_ ## NAME) fail("Failed to import " #NAME "()!"); return s_ ## NAME PASS; } #include "cudatools.h" #undef FW_DLL_IMPORT_CUDA #undef FW_DLL_IMPORT_CUV2 #undef FW_DLL_IMPORT_CUV3 #undef IMPORTS_H namespace cudalab { // getDriverVersion // ---------------------------------------------------------------------------- static int getDriverVersion() { int version = 2010; cuDriverGetVersion( &version ); version /= 10; return version / 10 + version % 10; } // tostring // ---------------------------------------------------------------------------- static string tostring( const char* fmt, ... ) { string str; va_list args; va_start( args, fmt ); setfv( str, fmt, args ); va_end( args ); return str; } // queryEnv // ---------------------------------------------------------------------------- static string queryEnv( const string& name ) { DWORD bufferSize = GetEnvironmentVariable( name.c_str(), NULL, 0 ); if (!bufferSize) return ""; char* buffer = new char[bufferSize]; buffer[0] = '\0'; GetEnvironmentVariable( name.c_str(), buffer, bufferSize ); string res = buffer; delete[] buffer; return res; } // splitPathList // ---------------------------------------------------------------------------- static void splitPathList( vector& res, const string& value ) { for (int startIdx = 0; startIdx < value.length(); ) { int endIdx = (int)value.find_first_of( ";", startIdx ); if (endIdx == -1) endIdx = (int)value.length(); string item = value.substr( startIdx, endIdx - startIdx ); if (item.length() >= 2 && item.at( 0 ) == '\"' && item.at( item.length() - 1 ) == '\"') item = item.substr( 1, item.length() - 2 ); res.push_back( item ); startIdx = endIdx + 1; } } // fileExists // ---------------------------------------------------------------------------- static bool fileExists( const string& name ) { return ((GetFileAttributes( name.c_str() ) & FILE_ATTRIBUTE_DIRECTORY) == 0); } // decodeError // ---------------------------------------------------------------------------- static const char* decodeError( CUresult res ) { static char error[128]; switch (res) { default: strcpy( error, "Unknown CUresult" ); break; case CUDA_SUCCESS: strcpy( error, "No error" ); break; case CUDA_ERROR_INVALID_VALUE: strcpy( error, "Invalid value" ); break; case CUDA_ERROR_OUT_OF_MEMORY: strcpy( error, "Out of memory" ); break; case CUDA_ERROR_NOT_INITIALIZED: strcpy( error, "Not initialized" ); break; case CUDA_ERROR_DEINITIALIZED: strcpy( error, "Deinitialized" ); break; case CUDA_ERROR_NO_DEVICE: strcpy( error, "No device" ); break; case CUDA_ERROR_INVALID_DEVICE: strcpy( error, "Invalid device" ); break; case CUDA_ERROR_INVALID_IMAGE: strcpy( error, "Invalid image" ); break; case CUDA_ERROR_INVALID_CONTEXT: strcpy( error, "Invalid context" ); break; case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: strcpy( error, "Context already current" ); break; case CUDA_ERROR_MAP_FAILED: strcpy( error, "Map failed" ); break; case CUDA_ERROR_UNMAP_FAILED: strcpy( error, "Unmap failed" ); break; case CUDA_ERROR_ARRAY_IS_MAPPED: strcpy( error, "Array is mapped" ); break; case CUDA_ERROR_ALREADY_MAPPED: strcpy( error, "Already mapped" ); break; case CUDA_ERROR_NO_BINARY_FOR_GPU: strcpy( error, "No binary for GPU" ); break; case CUDA_ERROR_ALREADY_ACQUIRED: strcpy( error, "Already acquired" ); break; case CUDA_ERROR_NOT_MAPPED: strcpy( error, "Not mapped" ); break; case CUDA_ERROR_INVALID_SOURCE: strcpy( error, "Invalid source" ); break; case CUDA_ERROR_FILE_NOT_FOUND: strcpy( error, "File not found" ); break; case CUDA_ERROR_INVALID_HANDLE: strcpy( error, "Invalid handle" ); break; case CUDA_ERROR_NOT_FOUND: strcpy( error, "Not found" ); break; case CUDA_ERROR_NOT_READY: strcpy( error, "Not ready" ); break; case CUDA_ERROR_LAUNCH_FAILED: strcpy( error, "Launch failed" ); break; case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: strcpy( error, "Launch out of resources" ); break; case CUDA_ERROR_LAUNCH_TIMEOUT: strcpy( error, "Launch timeout" ); break; case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: strcpy( error, "Launch incompatible texturing" ); break; case CUDA_ERROR_UNKNOWN: strcpy( error, "Unknown error" ); break; case CUDA_ERROR_PROFILER_DISABLED: strcpy( error, "Profiler disabled" ); break; case CUDA_ERROR_PROFILER_NOT_INITIALIZED: strcpy( error, "Profiler not initialized" ); break; case CUDA_ERROR_PROFILER_ALREADY_STARTED: strcpy( error, "Profiler already started" ); break; case CUDA_ERROR_PROFILER_ALREADY_STOPPED: strcpy( error, "Profiler already stopped" ); break; case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: strcpy( error, "Not mapped as array" ); break; case CUDA_ERROR_NOT_MAPPED_AS_POINTER: strcpy( error, "Not mapped as pointer" ); break; case CUDA_ERROR_ECC_UNCORRECTABLE: strcpy( error, "ECC uncorrectable" ); break; case CUDA_ERROR_UNSUPPORTED_LIMIT: strcpy( error, "Unsupported limit" ); break; case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: strcpy( error, "Context already in use" ); break; case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: strcpy( error, "Shared object symbol not found" ); break; case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: strcpy( error, "Shared object init failed" ); break; case CUDA_ERROR_OPERATING_SYSTEM: strcpy( error, "Operating system error" ); break; case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: strcpy( error, "Peer access already enabled" ); break; case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: strcpy( error, "Peer access not enabled" ); break; case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: strcpy( error, "Primary context active" ); break; case CUDA_ERROR_CONTEXT_IS_DESTROYED: strcpy( error, "Context is destroyed" ); break; case CUDA_ERROR_ILLEGAL_ADDRESS: strcpy( error, "Illegal address" ); break; case CUDA_ERROR_MISALIGNED_ADDRESS: strcpy( error, "Misaligned address" ); break; } return error; } // CheckError // ---------------------------------------------------------------------------- static void CheckError( const char* funcName, CUresult res ) { if (res != CUDA_SUCCESS) { fail( "%s() failed: %s!", funcName, decodeError( res ) ); } } // loggedError // ---------------------------------------------------------------------------- static void loggedError( char* description, char* logFile ) { string message = description; FILE* f = fopen( logFile, "r" ); char buffer[8192]; bool first = true; while (!feof( f )) { buffer[0] = 0; fgets( buffer, 8190, f ); if (!buffer[0]) continue; if (strstr( buffer, "error detected" )) break; if (!first) message += "\n"; first = false; message += buffer; } fail( "%s", message.c_str() ); } // selectDevice // ---------------------------------------------------------------------------- static CUdevice selectDevice() { int numDevices; CheckError( "cuDeviceGetCount", cuDeviceGetCount( &numDevices ) ); CUdevice device = NULL; __int64 bestScore = ((__int64)-1 << 63); for (int i = 0; i < numDevices; i++) { CUdevice dev; CheckError( "cuDeviceGet", cuDeviceGet( &dev, i ) ); // TODO: Use cuGLGetDevices() on CUDA 4.1+. int archMajor, archMinor; CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &archMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device ) ); CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &archMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device ) ); int arch = archMajor * 10 + archMinor; int clockRate, numProcessors; CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) ); CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &numProcessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) ); __int64 score = 0; score += (__int64)arch << 48; score += (__int64)clockRate * numProcessors; if (score > bestScore) { device = dev; bestScore = score; } } if (bestScore == ((__int64)-1 << 63)) fail( "CudaModule: No appropriate CUDA device found." ); return device; } // printDeviceInfo // ---------------------------------------------------------------------------- static void printDeviceInfo( CUdevice device ) { static const struct { CUdevice_attribute attrib; const char* name; } attribs[] = { #define A21(ENUM, NAME) { CU_DEVICE_ATTRIBUTE_ ## ENUM, NAME }, #define A40(ENUM, NAME) A21(ENUM, NAME) A21( CLOCK_RATE, "Clock rate" ) A40( MEMORY_CLOCK_RATE, "Memory clock rate" ) A21( MULTIPROCESSOR_COUNT, "Number of SMs" ) A21( MAX_THREADS_PER_BLOCK, "Max threads per block" ) A40( MAX_THREADS_PER_MULTIPROCESSOR, "Max threads per SM" ) A21( REGISTERS_PER_BLOCK, "Max registers per block" ) A21( SHARED_MEMORY_PER_BLOCK, "Max shared mem per block" ) A21( TOTAL_CONSTANT_MEMORY, "Constant memory" ) A21( MAX_BLOCK_DIM_X, "Max blockDim.x" ) A21( MAX_GRID_DIM_X, "Max gridDim.x" ) A40( CONCURRENT_KERNELS, "Concurrent launches supported" ) A21( GPU_OVERLAP, "Concurrent memcopy supported" ) A40( ASYNC_ENGINE_COUNT, "Max concurrent memcopies" ) A40( UNIFIED_ADDRESSING, "Unified addressing supported" ) A40( CAN_MAP_HOST_MEMORY, "Can map host memory" ) #undef A21 #undef A40 }; char name[256]; int major, minor; size_t memory; CheckError( "cuDeviceGetName", cuDeviceGetName( name, ARRAY_SIZE( name ) - 1, device ) ); CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device ) ); CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device ) ); CheckError( "cuDeviceTotalMem", cuDeviceTotalMem( &memory, device ) ); name[ARRAY_SIZE( name ) - 1] = '\0'; printf( "\n" ); printf( "%-32s%s\n", tostring( "CUDA device %d", (int)device ).c_str(), name ); printf( "%-32s%s\n", "---", "---" ); printf( "%-32s%d.%d\n", "Compute capability", major, minor ); printf( "%-32s%.0f megs\n", "Total memory", (float)memory * exp2( -20 ) ); #if 1 for (int i = 0; i < (int)ARRAY_SIZE( attribs ); i++) { int value; if (cuDeviceGetAttribute( &value, attribs[i].attrib, device ) == CUDA_SUCCESS) printf( "%-32s%d\n", attribs[i].name, value ); } #endif printf( "\n" ); } } // namespace cudalab // CUDAModule::CheckError // ---------------------------------------------------------------------------- void CUDAModule::CheckError( const char* funcName, CUresult res ) { cudalab::CheckError( funcName, res ); } // CUDAModule::Error // ---------------------------------------------------------------------------- void CUDAModule::Error( const char* message ) { MessageBox( NULL, message, "Fatal error", MB_OK ); } // CUDAModule::InitCUDA // ---------------------------------------------------------------------------- void CUDAModule::InitCUDA() { CUresult res = cuInit( 0 ); if (res != CUDA_SUCCESS) { if (res != CUDA_ERROR_NO_DEVICE) CheckError( "cuInit", res ); return; } device = selectDevice(); printDeviceInfo( device ); uint flags = 0; flags |= CU_CTX_SCHED_SPIN; // use sync() if you want to yield flags |= CU_CTX_LMEM_RESIZE_TO_MAX; // reduce launch overhead with large localmem CheckError( "cuCtxCreate", cuCtxCreate( &context, flags, device ) ); int major, minor; CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device ) ); CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device ) ); computeCapability = major * 10 + minor; CheckError( "cuDeviceGetAttribute", cuDeviceGetAttribute( &SMCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device ) ); } // CUDAModule::CUDAModule // ---------------------------------------------------------------------------- CUDAModule::CUDAModule( char* source, int regs, char* dep1, char* dep2, char* dep3, char* dep4, char* dep5, char* dep6, char* dep7, char* dep8, char* dep9, char* depA, char* depB, char* depC, char* depD, char* depE, char* depF, char* depG, char* depH, char* depI, char* depJ, char* depK, char* depL ) { // initialize cuda if this is our first module if (!initialized) { InitCUDA(); initialized = true; printf( "loading CUDA kernels...\n" ); } // prepare cubin string cmd, cubin, ptx; int cc = computeCapability; char ccc[10]; sprintf( ccc, "%i", cc ); cubin = source; #ifdef _DEBUG cubin = cubin.substr( 0, cubin.find( ".cu" ) ) + "_" + ccc + "_debug.cubin"; ptx = cubin.substr( 0, cubin.find( ".cu" ) ) + "_" + ccc + "_debug.ptx"; #else cubin = cubin.substr( 0, cubin.find( ".cu" ) ) + "_" + ccc + ".cubin"; ptx = cubin.substr( 0, cubin.find( ".cu" ) ) + "_" + ccc + ".ptx"; #endif bool modified = false; vector modFiles; if (fileExists( source )) { if (FileIsNewer( source, cubin.c_str() )) modified = true, modFiles.push_back( source ); if (dep1) if (FileIsNewer( dep1, cubin.c_str() )) modified = true, modFiles.push_back( dep1 ); if (dep2) if (FileIsNewer( dep2, cubin.c_str() )) modified = true, modFiles.push_back( dep2 ); if (dep3) if (FileIsNewer( dep3, cubin.c_str() )) modified = true, modFiles.push_back( dep3 ); if (dep4) if (FileIsNewer( dep4, cubin.c_str() )) modified = true, modFiles.push_back( dep4 ); if (dep5) if (FileIsNewer( dep5, cubin.c_str() )) modified = true, modFiles.push_back( dep5 ); if (dep6) if (FileIsNewer( dep6, cubin.c_str() )) modified = true, modFiles.push_back( dep6 ); if (dep7) if (FileIsNewer( dep7, cubin.c_str() )) modified = true, modFiles.push_back( dep7 ); if (dep8) if (FileIsNewer( dep8, cubin.c_str() )) modified = true, modFiles.push_back( dep8 ); if (dep9) if (FileIsNewer( dep9, cubin.c_str() )) modified = true, modFiles.push_back( dep9 ); if (depA) if (FileIsNewer( depA, cubin.c_str() )) modified = true, modFiles.push_back( depA ); if (depB) if (FileIsNewer( depB, cubin.c_str() )) modified = true, modFiles.push_back( depB ); if (depC) if (FileIsNewer( depC, cubin.c_str() )) modified = true, modFiles.push_back( depC ); if (depD) if (FileIsNewer( depD, cubin.c_str() )) modified = true, modFiles.push_back( depD ); if (depE) if (FileIsNewer( depE, cubin.c_str() )) modified = true, modFiles.push_back( depE ); if (depF) if (FileIsNewer( depF, cubin.c_str() )) modified = true, modFiles.push_back( depF ); if (depG) if (FileIsNewer( depG, cubin.c_str() )) modified = true, modFiles.push_back( depG ); if (depH) if (FileIsNewer( depH, cubin.c_str() )) modified = true, modFiles.push_back( depH ); if (depI) if (FileIsNewer( depI, cubin.c_str() )) modified = true, modFiles.push_back( depI ); if (depJ) if (FileIsNewer( depJ, cubin.c_str() )) modified = true, modFiles.push_back( depJ ); if (depK) if (FileIsNewer( depK, cubin.c_str() )) modified = true, modFiles.push_back( depK ); if (depL) if (FileIsNewer( depL, cubin.c_str() )) modified = true, modFiles.push_back( depL ); } if (modified) { string modified = modFiles[0].substr( modFiles[0].rfind( '/' ) + 1, modFiles[0].length() - (modFiles[0].rfind( '/' ) + 1) ); for (int i = 0; i < modFiles.size(); i++) { string& file = modFiles[i]; modified += ", " + file.substr( file.rfind( '/' ) + 1, file.length() - (file.rfind( '/' ) + 1) ); }; printf( "%s module: %s modified; compiling... ", cubin.substr( cubin.rfind( '/' ) + 1, cubin.length() - (cubin.rfind( '/' ) + 1) ).c_str(), modified.c_str() ); char* p = source; while (strstr( p, "/" )) p = strstr( p, "/" ) + 1; while (strstr( p, "\\" )) p = strstr( p, "\\" ) + 1; char blFileName[256], elFileName[256]; strcpy( blFileName, "buildlog_" ); strcat( blFileName, p ); strcat( blFileName, ".txt" ); strcpy( elFileName, "errorlog_" ); strcat( elFileName, p ); strcat( elFileName, ".txt" ); int maxRegCount = regs; if (cc < 50) maxRegCount = min( 63, regs ); #ifdef _DEBUG #pragma message( "compiling gpu code with debug information." ) cmd = tostring( "nvcc -cubin -use_fast_math -I=\"lib\\CUDA\" -Xptxas=\"-v\" -lineinfo -maxrregcount=%i -arch=sm_%i -o %s %s 2>>%s 1>>%s", maxRegCount, cc, cubin.c_str(), source, elFileName, blFileName ); #else #pragma message( "compiling gpu code without debug information." ) cmd = tostring( "nvcc -cubin -use_fast_math -I=\"lib\\CUDA\" -restrict -Xptxas=\"-v\" -maxrregcount=%i -arch=sm_%i -o %s %s 2>>%s 1>>%s", maxRegCount, cc, cubin.c_str(), source, elFileName, blFileName ); #endif ::remove( cubin.c_str() ); ::remove( blFileName ); ::remove( elFileName ); int i; Timer t; t.reset(); if (((i = system( cmd.c_str() )) != 0) || (!fileExists( cubin ))) { char t[128]; sprintf( t, "CudaCompiler: Compilation failed! (%i)", i ); loggedError( t, elFileName ); } #if 0 // generate ptx output for inspection cmd = tostring( "nvcc -ptx -use_fast_math -I=\"lib\\CUDA\" -Xptxas=\"-v\" -maxrregcount=%i -arch=sm_61 -o %s %s 2>>%s 1>>%s", maxRegCount, ptx.c_str(), source, elFileName, blFileName ); system( cmd.c_str() ); #endif printf( "done (%5.2fs).\n", t.elapsed() * 0.001f ); } else { printf( "loading existing module %s\n", cubin.substr( cubin.rfind( '/' ) + 1, cubin.length() - (cubin.rfind( '/' ) + 1) ).c_str() ); } CheckError( "cuModuleLoad", cuModuleLoad( &module, cubin.c_str() ) ); globals = 0; texRefs = surfRefs = 0; resourcesMapped = false; } // CUDAModule::~CUDAModule // ---------------------------------------------------------------------------- CUDAModule::~CUDAModule() { if (context) CheckError( "cuCtxDestroy", cuCtxDestroy( context ) ); context = 0; device = 0; } // CUDAModule::GetGlobalID // ---------------------------------------------------------------------------- int CUDAModule::GetGlobalID( const char* name ) { for (int i = 0; i < globals; i++) if (!strcmp( globalVar[i].name, name )) return i; globalVar[globals].name = new char[strlen( name ) + 1]; strcpy( globalVar[globals].name, name ); CheckError( "cuModuleGetGlobal", cuModuleGetGlobal( &globalVar[globals].address, &globalVar[globals].size, module, name ) ); globals++; return globals - 1; } // CUDAModule::GetTexRefID // ---------------------------------------------------------------------------- int CUDAModule::GetTexRefID( const char* name ) { for (int i = 0; i < texRefs; i++) if (!strcmp( texRef[i].name, name )) return i; texRef[texRefs].name = new char[strlen( name ) + 1]; strcpy( texRef[texRefs].name, name ); CheckError( "cuModuleGetTexRef", cuModuleGetTexRef( &texRef[texRefs].ref, module, name ) ); texRefs++; return texRefs - 1; } // CUDAModule::LinkTexture // ---------------------------------------------------------------------------- void CUDAModule::LinkTexture( const char* name, GLTexture* texture ) { int idx = texRefs++; texRef[idx].name = new char[strlen( name ) + 1]; strcpy( texRef[idx].name, name ); texRef[idx].textureID = texture->GetID(); CUtexref ref; CUgraphicsResource res; CUarray_format format( CU_AD_FORMAT_UNSIGNED_INT8 ); CUaddress_mode addressMode = CU_TR_ADDRESS_MODE_CLAMP; CUfilter_mode filterMode = CU_TR_FILTER_MODE_LINEAR; CheckError( "cuModuleGetTexRef", cuModuleGetTexRef( &ref, module, name ) ); CheckError( "cuTexRefSetFormat", cuTexRefSetFormat( ref, format, 2 ) ); for (int dim = 0; dim < 3; dim++) CheckError( "cuTexRefSetAddressMode", cuTexRefSetAddressMode( ref, dim, addressMode ) ); CheckError( "cuTexRefSetFilterMode", cuTexRefSetFilterMode( ref, filterMode ) ); CheckError( "cuTexRefSetFlags", cuTexRefSetFlags( ref, CU_TRSF_NORMALIZED_COORDINATES ) ); CheckError( "cuGraphicsGLRegisterImage", cuGraphicsGLRegisterImage( &res, texture->GetID(), GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY ) ); texRef[idx].ref = ref; texRef[idx].res = res; texRef[idx].firstUse = true; } // CUDAModule::LinkSurfaceToTexture // ---------------------------------------------------------------------------- int CUDAModule::FindSurfRef( const char* name ) { for (int i = 0; i < surfRefs; i++) if (!strcmp( surfRef[i].name, name )) return i; return -1; } // CUDAModule::LinkSurfaceToTexture // ---------------------------------------------------------------------------- void CUDAModule::LinkSurfaceToTexture( const char* name, GLTexture* texture ) { int idx = FindSurfRef( name ); if (idx == -1) { idx = surfRefs++; surfRef[idx].name = new char[strlen( name ) + 1]; strcpy( surfRef[idx].name, name ); } surfRef[idx].textureID = texture->GetID(); CUsurfref ref; CUgraphicsResource res; CheckError( "cuModuleGetSurfRef", cuModuleGetSurfRef( &ref, module, name ) ); CheckError( "cuGraphicsGLRegisterImage", cuGraphicsGLRegisterImage( &res, texture->GetID(), GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST ) ); surfRef[idx].ref = ref; surfRef[idx].res = res; surfRef[idx].firstUse = true; } // CUDAModule::MapResources // ---------------------------------------------------------------------------- void CUDAModule::MapResources() { if (resourcesMapped) return; // map textures to surfaces for (int i = 0; i < surfRefs; i++) { if ((computeCapability < 30) || (surfRef[i].firstUse)) { CheckError( "cuGraphicsMapResources", cuGraphicsMapResources( 1, &surfRef[i].res, 0 ) ); CheckError( "cuGraphicsSubResourceGetMappedArray", cuGraphicsSubResourceGetMappedArray( &surfRef[i].ar, surfRef[i].res, 0, 0 ) ); CheckError( "cuSurfRefSetArray", cuSurfRefSetArray( surfRef[i].ref, surfRef[i].ar, 0 ) ); surfRef[i].firstUse = false; } } // map GL textures to CUDA textures for (int i = 0; i < texRefs; i++) { if ((computeCapability < 30) || (texRef[i].firstUse)) { CheckError( "cuGraphicsMapResources", cuGraphicsMapResources( 1, &texRef[i].res, 0 ) ); CheckError( "cuGraphicsSubResourceGetMappedArray", cuGraphicsSubResourceGetMappedArray( &texRef[i].ar, texRef[i].res, 0, 0 ) ); CheckError( "cuTexRefSetArray", cuTexRefSetArray( texRef[i].ref, texRef[i].ar, CU_TRSA_OVERRIDE_FORMAT ) ); texRef[i].firstUse = false; } } // mark resources as mapped resourcesMapped = true; } // CUDAModule::UnmapResources // ---------------------------------------------------------------------------- void CUDAModule::UnmapResources() { if (!resourcesMapped) return; if (computeCapability < 30) { for (int i = 0; i < surfRefs; i++) { CheckError( "cuGraphicsUnmapResources", cuGraphicsUnmapResources( 1, &surfRef[i].res, 0 ) ); } for (int i = 0; i < texRefs; i++) { CheckError( "cuGraphicsUnmapResources", cuGraphicsUnmapResources( 1, &texRef[i].res, 0 ) ); } } // mark resources as unmapped resourcesMapped = false; } // CUDAKernel::CUDAKernel // ---------------------------------------------------------------------------- CUDAKernel::CUDAKernel( CUDAModule* cudaModule, char* function ) { module = cudaModule; FindKernel( function ); if (!kernel) fail( "Module doees not contain kernel function %s.", function ); // initialize module state firstLaunch = true; paramsChanged = true; // allocate room for parameters paramBuffer = new unsigned char[MAXPARAMS * sizeof( GenericParameter )]; paramCount = 0; // set default block and grid dimensions blockDimX = 64, blockDimY = 1; gridDimX = 64, gridDimY = 1; // default to prefer L1 preferL1 = true; } // CUDAKernel::~CUDAKernel // ---------------------------------------------------------------------------- CUDAKernel::~CUDAKernel() { delete paramBuffer; } // CUDAKernel::FindKernel // ---------------------------------------------------------------------------- void CUDAKernel::FindKernel( const char* name ) { kernel = NULL; CheckError( "cuModuleGetFunction", cuModuleGetFunction( &kernel, module->GetModule(), name ) ); if (!kernel) { string altName( "__globfunc_" ); altName += name; CheckError( "cuModuleGetFunction", cuModuleGetFunction( &kernel, module->GetModule(), altName.c_str() ) ); } } // CUDAKernel::Launch // ---------------------------------------------------------------------------- void CUDAKernel::Launch() { // gather parameters if (paramsChanged) { paramSize = 0; for (int i = 0; i < paramCount; i++) { const int alignment = paramSize & (param[i].align - 1); if (alignment != 0) paramSize += param[i].align - alignment; memcpy( paramBuffer + paramSize, param[i].b, param[i].size ); paramSize += param[i].size; } paramsChanged = false; } // link textures and surfaces module->MapResources(); // prepare kernel for launch if (firstLaunch) { CheckError( "cuFuncSetCacheConfig", cuFuncSetCacheConfig( kernel, preferL1 ? CU_FUNC_CACHE_PREFER_L1 : CU_FUNC_CACHE_PREFER_SHARED ) ); CheckError( "cuFuncSetSharedMemConfig", cuFuncSetSharedMemConfig( kernel, (false) ? CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE : CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE ) ); firstLaunch = false; } // launch kernel uint blocksx = (gridDimX + (blockDimX - 1)) / blockDimX; if (blocksx > 0) { void* params[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, ¶mSize, CU_LAUNCH_PARAM_END }; CheckError( "cuLaunchKernel", cuLaunchKernel( kernel, blocksx, gridDimY / blockDimY, 1, blockDimX, blockDimY, 1, 0, 0, NULL, params ) ); } // unmap textures and surfaces module->UnmapResources(); }