#ifdef IMPORTS_H //------------------------------------------------------------------------ // CUDA Driver API //------------------------------------------------------------------------ // CUDA 2.1 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuInit, (unsigned int Flags), (Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGet, (CUdevice* device, int ordinal), (device, ordinal) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetCount, (int* count), (count) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetName, (char* name, int len, CUdevice dev), (name, len, dev) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceComputeCapability, (int* major, int* minor, CUdevice dev), (major, minor, dev) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetProperties, (CUdevprop* prop, CUdevice dev), (prop, dev) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetAttribute, (int* pi, CUdevice_attribute attrib, CUdevice dev), (pi, attrib, dev) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuCtxDestroy, (CUcontext ctx), (ctx) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxAttach, (CUcontext* pctx, unsigned int flags), (pctx, flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxDetach, (CUcontext ctx), (ctx) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuCtxPushCurrent, (CUcontext ctx), (ctx) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuCtxPopCurrent, (CUcontext* pctx), (pctx) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetDevice, (CUdevice* device), (device) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxSynchronize, (void), () ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleLoad, (CUmodule* module, const char* fname), (module, fname) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleLoadData, (CUmodule* module, const void* image), (module, image) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleLoadDataEx, (CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void** optionValues), (module, image, numOptions, options, optionValues) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleLoadFatBinary, (CUmodule* module, const void* fatCubin), (module, fatCubin) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleUnload, (CUmodule hmod), (hmod) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleGetFunction, (CUfunction* hfunc, CUmodule hmod, const char* name), (hfunc, hmod, name) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleGetTexRef, (CUtexref* pTexRef, CUmodule hmod, const char* name), (pTexRef, hmod, name) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemFreeHost, (void* p), (p) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuFuncSetBlockShape, (CUfunction hfunc, int x, int y, int z), (hfunc, x, y, z) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuFuncSetSharedSize, (CUfunction hfunc, unsigned int bytes), (hfunc, bytes) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuArrayDestroy, (CUarray hArray), (hArray) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefCreate, (CUtexref* pTexRef), (pTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefDestroy, (CUtexref hTexRef), (hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetArray, (CUtexref hTexRef, CUarray hArray, unsigned int Flags), (hTexRef, hArray, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetFormat, (CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents), (hTexRef, fmt, NumPackedComponents) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetAddressMode, (CUtexref hTexRef, int dim, CUaddress_mode am), (hTexRef, dim, am) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetFilterMode, (CUtexref hTexRef, CUfilter_mode fm), (hTexRef, fm) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetFlags, (CUtexref hTexRef, unsigned int Flags), (hTexRef, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetArray, (CUarray* phArray, CUtexref hTexRef), (phArray, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetAddressMode, (CUaddress_mode* pam, CUtexref hTexRef, int dim), (pam, hTexRef, dim) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetFilterMode, (CUfilter_mode* pfm, CUtexref hTexRef), (pfm, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetFormat, (CUarray_format* pFormat, int* pNumChannels, CUtexref hTexRef), (pFormat, pNumChannels, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetFlags, (unsigned int* pFlags, CUtexref hTexRef), (pFlags, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuParamSetSize, (CUfunction hfunc, unsigned int numbytes), (hfunc, numbytes) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuParamSeti, (CUfunction hfunc, int offset, unsigned int value), (hfunc, offset, value) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuParamSetf, (CUfunction hfunc, int offset, float value), (hfunc, offset, value) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuParamSetv, (CUfunction hfunc, int offset, void* ptr, unsigned int numbytes), (hfunc, offset, ptr, numbytes) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuParamSetTexRef, (CUfunction hfunc, int texunit, CUtexref hTexRef), (hfunc, texunit, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLaunch, (CUfunction f), (f) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLaunchGrid, (CUfunction f, int grid_width, int grid_height), (f, grid_width, grid_height) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLaunchGridAsync, (CUfunction f, int grid_width, int grid_height, CUstream hStream), (f, grid_width, grid_height, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuEventCreate, (CUevent* phEvent, unsigned int Flags), (phEvent, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuEventRecord, (CUevent hEvent, CUstream hStream), (hEvent, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuEventQuery, (CUevent hEvent), (hEvent) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuEventSynchronize, (CUevent hEvent), (hEvent) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuEventDestroy, (CUevent hEvent), (hEvent) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuEventElapsedTime, (float* pMilliseconds, CUevent hStart, CUevent hEnd), (pMilliseconds, hStart, hEnd) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuStreamCreate, (CUstream* phStream, unsigned int Flags), (phStream, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuStreamQuery, (CUstream hStream), (hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuStreamSynchronize, (CUstream hStream), (hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuStreamDestroy, (CUstream hStream), (hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLInit, (void), () ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLRegisterBufferObject, (GLuint bufferobj), (bufferobj) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLUnmapBufferObject, (GLuint bufferobj), (bufferobj) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLUnregisterBufferObject, (GLuint bufferobj), (bufferobj) ) // CUDA 2.2 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDriverGetVersion, (int* driverVersion), (driverVersion) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemHostAlloc, (void** pp, size_t bytesize, unsigned int Flags), (pp, bytesize, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuFuncGetAttribute, (int* pi, CUfunction_attribute attrib, CUfunction hfunc), (pi, attrib, hfunc) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuWGLGetDevice, (CUdevice* pDevice, HGPUNV hGpu), (pDevice, hGpu) ) // CUDA 2.3 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemHostGetFlags, (unsigned int* pFlags, void* p), (pFlags, p) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLSetBufferObjectMapFlags, (GLuint buffer, unsigned int Flags), (buffer, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLUnmapBufferObjectAsync, (GLuint buffer, CUstream hStream), (buffer, hStream) ) // CUDA 3.0 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuFuncSetCacheConfig, (CUfunction hfunc, CUfunc_cache config), (hfunc, config) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsUnregisterResource, (CUgraphicsResource resource), (resource) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsSubResourceGetMappedArray, (CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel), (pArray, resource, arrayIndex, mipLevel) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuGraphicsResourceSetMapFlags, (CUgraphicsResource resource, unsigned int flags), (resource, flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsMapResources, (unsigned int count, CUgraphicsResource* resources, CUstream hStream), (count, resources, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsUnmapResources, (unsigned int count, CUgraphicsResource* resources, CUstream hStream), (count, resources, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGetExportTable, (const void** ppExportTable, const CUuuid* pExportTableId), (ppExportTable, pExportTableId) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsGLRegisterBuffer, (CUgraphicsResource* pCudaResource, GLuint buffer, unsigned int Flags), (pCudaResource, buffer, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsGLRegisterImage, (CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags), (pCudaResource, image, target, Flags) ) // CUDA 3.1 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuModuleGetSurfRef, (CUsurfref* pSurfRef, CUmodule hmod, const char* name), (pSurfRef, hmod, name) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuSurfRefSetArray, (CUsurfref hSurfRef, CUarray hArray, unsigned int Flags), (hSurfRef, hArray, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuSurfRefGetArray, (CUarray* phArray, CUsurfref hSurfRef), (phArray, hSurfRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxSetLimit, (CUlimit limit, size_t value), (limit, value) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetLimit, (size_t* pvalue, CUlimit limit), (pvalue, limit) ) // CUDA 3.2 FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuDeviceTotalMem, (size_t* bytes, CUdevice dev), (bytes, dev) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuCtxCreate, (CUcontext* pctx, unsigned int flags, CUdevice dev), (pctx, flags, dev) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuModuleGetGlobal, (CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name), (dptr, bytes, hmod, name) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemGetInfo, (size_t* free, size_t* total), (free, total) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemAlloc, (CUdeviceptr* dptr, size_t bytesize), (dptr, bytesize) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemAllocPitch, (CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes), (dptr, pPitch, WidthInBytes, Height, ElementSizeBytes) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemFree, (CUdeviceptr dptr), (dptr) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemGetAddressRange, (CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr), (pbase, psize, dptr) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemAllocHost, (void** pp, size_t bytesize), (pp, bytesize) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyHtoD, (CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount), (dstDevice, srcHost, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyDtoH, (void* dstHost, CUdeviceptr srcDevice, size_t ByteCount), (dstHost, srcDevice, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyDtoD, (CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount), (dstDevice, srcDevice, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyDtoA, (CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount), (dstArray, dstOffset, srcDevice, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyAtoD, (CUdeviceptr dstDevice, CUarray hSrc, size_t srcOffset, size_t ByteCount), (dstDevice, hSrc, srcOffset, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyHtoA, (CUarray dstArray, size_t dstOffset, const void* pSrc, size_t ByteCount), (dstArray, dstOffset, pSrc, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyAtoH, (void* dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount), (dstHost, srcArray, srcOffset, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyAtoA, (CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount), (dstArray, dstOffset, srcArray, srcOffset, ByteCount) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyHtoAAsync, (CUarray dstArray, size_t dstOffset, const void* pSrc, size_t ByteCount, CUstream hStream), (dstArray, dstOffset, pSrc, ByteCount, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyAtoHAsync, (void* dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream), (dstHost, srcArray, srcOffset, ByteCount, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpy2D, (const CUDA_MEMCPY2D* pCopy), (pCopy) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpy2DUnaligned, (const CUDA_MEMCPY2D* pCopy), (pCopy) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpy3D, (const CUDA_MEMCPY3D* pCopy), (pCopy) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyHtoDAsync, (CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream), (dstDevice, srcHost, ByteCount, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyDtoHAsync, (void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream), (dstHost, srcDevice, ByteCount, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpy2DAsync, (const CUDA_MEMCPY2D* pCopy, CUstream hStream), (pCopy, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpy3DAsync, (const CUDA_MEMCPY3D* pCopy, CUstream hStream), (pCopy, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD8, (CUdeviceptr dstDevice, unsigned char uc, size_t N), (dstDevice, uc, N) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD16, (CUdeviceptr dstDevice, unsigned short us, size_t N), (dstDevice, us, N) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD32, (CUdeviceptr dstDevice, unsigned int ui, size_t N), (dstDevice, ui, N) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD2D8, (CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height), (dstDevice, dstPitch, uc, Width, Height) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD2D16, (CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height), (dstDevice, dstPitch, us, Width, Height) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemsetD2D32, (CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height), (dstDevice, dstPitch, ui, Width, Height) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuArrayCreate, (CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAllocateArray), (pHandle, pAllocateArray) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuArrayGetDescriptor, (CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarray hArray), (pArrayDescriptor, hArray) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuArray3DCreate, (CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAllocateArray), (pHandle, pAllocateArray) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuArray3DGetDescriptor, (CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CUarray hArray), (pArrayDescriptor, hArray) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuTexRefSetAddress, (size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes), (ByteOffset, hTexRef, dptr, bytes) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuTexRefGetAddress, (CUdeviceptr* pdptr, CUtexref hTexRef), (pdptr, hTexRef) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuGLCtxCreate, (CUcontext* pCtx, unsigned int Flags, CUdevice device), (pCtx, Flags, device) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuGLMapBufferObject, (CUdeviceptr* dptr, size_t* size, GLuint bufferobj), (dptr, size, bufferobj) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemHostGetDevicePointer, (CUdeviceptr* pdptr, void* p, unsigned int Flags), (pdptr, p, Flags) ) FW_DLL_IMPORT_CUV3( CUresult, CUDAAPI, cuTexRefSetAddress2D, (CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch), (hTexRef, desc, dptr, Pitch) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuGLMapBufferObjectAsync, (CUdeviceptr* dptr, size_t* size, GLuint buffer, CUstream hStream), (dptr, size, buffer, hStream) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemcpyDtoDAsync, (CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream), (dstDevice, srcDevice, ByteCount, hStream) ) FW_DLL_IMPORT_CUV3( CUresult, CUDAAPI, cuGraphicsResourceGetMappedPointer, (CUdeviceptr* pDevPtr, size_t* pSize, CUgraphicsResource resource), (pDevPtr, pSize, resource) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetCacheConfig, (CUfunc_cache* pconfig), (pconfig) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxSetCacheConfig, (CUfunc_cache config), (config) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetApiVersion, (CUcontext ctx, unsigned int* version), (ctx, version) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD8Async, (CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream), (dstDevice, uc, N, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD16Async, (CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream), (dstDevice, us, N, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD32Async, (CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream), (dstDevice, ui, N, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD2D8Async, (CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream), (dstDevice, dstPitch, uc, Width, Height, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD2D16Async, (CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream), (dstDevice, dstPitch, us, Width, Height, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemsetD2D32Async, (CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream), (dstDevice, dstPitch, ui, Width, Height, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuStreamWaitEvent, (CUstream hStream, CUevent hEvent, unsigned int Flags), (hStream, hEvent, Flags) ) // CUDA 4.0 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxSetCurrent, (CUcontext ctx), (ctx) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetCurrent, (CUcontext* pctx), (pctx) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuMemHostRegister, (void* p, size_t bytesize, unsigned int Flags), (p, bytesize, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemHostUnregister, (void* p), (p) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpy, (CUdeviceptr dst, CUdeviceptr src, size_t ByteCount), (dst, src, ByteCount) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpyPeer, (CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount), (dstDevice, dstContext, srcDevice, srcContext, ByteCount) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpy3DPeer, (const CUDA_MEMCPY3D_PEER* pCopy), (pCopy) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpyAsync, (CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream), (dst, src, ByteCount, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpyPeerAsync, (CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream), (dstDevice, dstContext, srcDevice, srcContext, ByteCount, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMemcpy3DPeerAsync, (const CUDA_MEMCPY3D_PEER* pCopy, CUstream hStream), (pCopy, hStream) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuPointerGetAttribute, (void* data, CUpointer_attribute attribute, CUdeviceptr ptr), (data, attribute, ptr) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLaunchKernel, (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra), (f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceCanAccessPeer, (int* canAccessPeer, CUdevice dev, CUdevice peerDev), (canAccessPeer, dev, peerDev) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxEnablePeerAccess, (CUcontext peerContext, unsigned int Flags), (peerContext, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxDisablePeerAccess, (CUcontext peerContext), (peerContext) ) // CUDA 4.1 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetByPCIBusId, (CUdevice* dev, char* pciBusId), (dev, pciBusId) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuDeviceGetPCIBusId, (char* pciBusId, int len, CUdevice dev), (pciBusId, len, dev) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuIpcGetEventHandle, (CUipcEventHandle* pHandle, CUevent event), (pHandle, event) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuIpcOpenEventHandle, (CUevent* phEvent, CUipcEventHandle handle), (phEvent, handle) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuIpcGetMemHandle, (CUipcMemHandle* pHandle, CUdeviceptr dptr), (pHandle, dptr) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuIpcOpenMemHandle, (CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags), (pdptr, handle, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuIpcCloseMemHandle, (CUdeviceptr dptr), (dptr) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGLGetDevices, (unsigned int* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList), (pCudaDeviceCount, pCudaDevices, cudaDeviceCount, deviceList) ) // CUDA 4.2 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxGetSharedMemConfig, (CUsharedconfig *pConfig), (pConfig) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuCtxSetSharedMemConfig, (CUsharedconfig config), (config) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuFuncSetSharedMemConfig, (CUfunction hfunc, CUsharedconfig config), (hfunc, config) ) // CUDA 5.0 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMipmappedArrayCreate, (CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned int numMipmapLevels), (pHandle, pMipmappedArrayDesc, numMipmapLevels) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMipmappedArrayGetLevel, (CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level), (pLevelArray, hMipmappedArray, level) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuMipmappedArrayDestroy, (CUmipmappedArray hMipmappedArray), (hMipmappedArray) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuStreamAddCallback, (CUstream hStream, CUstreamCallback callback, void* userData, unsigned int flags), (hStream, callback, userData, flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetMipmappedArray, (CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags), (hTexRef, hMipmappedArray, Flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetMipmapFilterMode, (CUtexref hTexRef, CUfilter_mode fm), (hTexRef, fm) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetMipmapLevelBias, (CUtexref hTexRef, float bias), (hTexRef, bias) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetMipmapLevelClamp, (CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp), (hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefSetMaxAnisotropy, (CUtexref hTexRef, unsigned int maxAniso), (hTexRef, maxAniso) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetMipmappedArray, (CUmipmappedArray* phMipmappedArray, CUtexref hTexRef), (phMipmappedArray, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetMipmapFilterMode, (CUfilter_mode* pfm, CUtexref hTexRef), (pfm, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetMipmapLevelBias, (float* pbias, CUtexref hTexRef), (pbias, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetMipmapLevelClamp, (float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef), (pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexRefGetMaxAnisotropy, (int* pmaxAniso, CUtexref hTexRef), (pmaxAniso, hTexRef) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexObjectCreate, (CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc), (pTexObject, pResDesc, pTexDesc, pResViewDesc) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexObjectDestroy, (CUtexObject texObject), (texObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexObjectGetResourceDesc, (CUDA_RESOURCE_DESC* pResDesc, CUtexObject texObject), (pResDesc, texObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexObjectGetTextureDesc, (CUDA_TEXTURE_DESC* pTexDesc, CUtexObject texObject), (pTexDesc, texObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuTexObjectGetResourceViewDesc, (CUDA_RESOURCE_VIEW_DESC* pResViewDesc, CUtexObject texObject), (pResViewDesc, texObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuSurfObjectCreate, (CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC* pResDesc), (pSurfObject, pResDesc) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuSurfObjectDestroy, (CUsurfObject surfObject), (surfObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuSurfObjectGetResourceDesc, (CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject), (pResDesc, surfObject) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuGraphicsResourceGetMappedMipmappedArray, (CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource), (pMipmappedArray, resource) ) // CUDA 5.5 FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut), (numOptions, options, optionValues, stateOut) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues), (state, type, data, size, name, numOptions, options, optionValues) ) FW_DLL_IMPORT_CUV2( CUresult, CUDAAPI, cuLinkAddFile, (CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues), (state, type, path, numOptions, options, optionValues) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut), (state, cubinOut, sizeOut) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuLinkDestroy, (CUlinkState state), (state) ) // CUDA 6.0 FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuOccupancyMaxActiveBlocksPerMultiprocessor, (int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize), (numBlocks, func, blockSize, dynamicSMemSize) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, (int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags), (numBlocks, func, blockSize, dynamicSMemSize, flags) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuOccupancyMaxPotentialBlockSize, (int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit), (minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit) ) FW_DLL_IMPORT_CUDA( CUresult, CUDAAPI, cuOccupancyMaxPotentialBlockSizeWithFlags, (int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags), (minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags) ) #else namespace cudalab { #define MAXPARAMS 32 #define MAXGLOBALS 16 #define MAXTEXREFS 16 #define MAXSURFREFS 16 enum { NOT_ALLOCATED = 0, ON_HOST = 1, ON_DEVICE = 2 }; class CUDAModule; template class CUDABuffer { public: CUDABuffer() : devicePtr( 0 ), hostPtr( 0 ), location( NOT_ALLOCATED ), owner( 0 ) {} CUDABuffer( __int64 elements, __int64 loc, void* source = 0 ) : devicePtr( 0 ), hostPtr( 0 ), location( loc ), owner( 0 ) { Construct( elements, source ); } ~CUDABuffer() { Destruct(); } void Resize( __int64 elements, void* source = 0 ) { Destruct(); Construct( elements, source ? source : hostPtr ); CopyToDevice(); for (CUtexref texRef : texRefs) SetTexRef( texRef ); } void BindToTexRef( CUDAModule* module, const char* refName, CUarray_format format = CU_AD_FORMAT_FLOAT, int components = 4 ) { cuDeviceGetAttribute( &maxTexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH, module->device ); CUtexref texRef; cuModuleGetTexRef( &texRef, module->GetModule(), refName ); texRefs.insert( texRef ); CUDAModule::CheckError( "cuTexRefSetAddressMode", cuTexRefSetAddressMode( texRef, 0, CU_TR_ADDRESS_MODE_WRAP ) ); CUDAModule::CheckError( "cuTexRefSetFormat", cuTexRefSetFormat( texRef, format, components ) ); CUDAModule::CheckError( "cuTexRefSetFilterMode", cuTexRefSetFilterMode( texRef, CU_TR_FILTER_MODE_POINT ) ); SetTexRef( texRef ); } void SetTexRef( CUtexref texRef ) { // Error checking if (sizeInBytes == 0) return; if (!devicePtr) FatalError( __FILE__, __LINE__, "Data not yet on device; can't bind to texref" ); CUarray_format format; int components; CUDAModule::CheckError( "cuTexRefGetFormat", cuTexRefGetFormat( &format, &components, texRef ) ); if (maxTexWidth < sizeInBytes / (format * components)) FatalError( __FILE__, __LINE__, "Texref exceeds maximum size!" ); //cuTexRefSetAddress will fail // Set the address and size size_t offset; CUDAModule::CheckError( "cuTexRefSetAddress", cuTexRefSetAddress( &offset, texRef, devicePtr, sizeInBytes ) ); } CUdeviceptr CopyToDevice() { if (sizeInBytes > 0) { if (!(location & ON_DEVICE)) { CUDAModule::CheckError( "cuMemAlloc", cuMemAlloc( &devicePtr, sizeInBytes ) ); location |= ON_DEVICE; owner |= ON_DEVICE; } CUDAModule::CheckError( "cuMemcpyHtoD", cuMemcpyHtoD( devicePtr, hostPtr, sizeInBytes ) ); } return devicePtr; } T* CopyToHost() { if (sizeInBytes > 0) { if (!(location & ON_HOST)) { hostPtr = (T*)_aligned_malloc( sizeInBytes, 64 ); location |= ON_HOST; owner |= ON_HOST; } CUDAModule::CheckError( "cuMemcpyDtoH", cuMemcpyDtoH( hostPtr, devicePtr, sizeInBytes ) ); } return hostPtr; } void Clear( int location ) { if (sizeInBytes > 0) { if (location & ON_HOST) memset( hostPtr, 0, sizeInBytes ); if (location & ON_DEVICE) CUDAModule::CheckError( "cuMemset", cuMemsetD32( devicePtr, 0, sizeInBytes / 4 ) ); } } CUdeviceptr& GetDevicePtr() { return devicePtr; } T*& GetHostPtr() { return hostPtr; } __int64 GetSizeInBytes() { return sizeInBytes; } __int64 GetSize() { return numElements; } private: void Construct( __int64 elements, void* source ) { numElements = elements; sizeInBytes = elements * sizeof( T ); if (elements > 0) { if (location & ON_HOST) { if (source) { hostPtr = (T*)source; } else { hostPtr = (T*)_aligned_malloc( sizeInBytes, 64 ); owner |= ON_HOST; } } if (location & ON_DEVICE) { CUDAModule::CheckError( "cuMemAlloc", cuMemAlloc( &devicePtr, sizeInBytes ) ); owner |= ON_DEVICE; } } } void Destruct() { if (sizeInBytes > 0) { if (owner & ON_HOST) { _aligned_free( hostPtr ); hostPtr = 0; owner &= ~ON_HOST; } if (owner & ON_DEVICE) { CUDAModule::CheckError( "cuMemFree", cuMemFree( devicePtr ) ); owner &= ~ON_DEVICE; } } sizeInBytes = 0; } CUdeviceptr devicePtr; T* hostPtr; __int64 location, owner, sizeInBytes, numElements; int maxTexWidth; unordered_set texRefs; }; class GLTexture; class CUDAModule { template friend class CUDABuffer; struct Global { CUdeviceptr address; size_t size; char* name; }; struct TexRef { CUtexref ref; CUgraphicsResource res; char* name; CUarray ar; int textureID; bool firstUse; }; struct SurfRef { CUsurfref ref; CUgraphicsResource res; char* name; CUarray ar; int textureID; bool firstUse; }; public: CUDAModule( char* source, int regs = 63, char* dep1 = 0, char* dep2 = 0, char* dep3 = 0, char* dep4 = 0, char* dep5 = 0, char* dep6 = 0, char* dep7 = 0, char* dep8 = 0, char* dep9 = 0, char* depA = 0, char* depB = 0, char* depC = 0, char* depD = 0, char* depE = 0, char* depF = 0, char* depG = 0, char* depH = 0, char* depI = 0, char* depJ = 0, char* depK = 0, char* depL = 0 ); ~CUDAModule(); void InitCUDA(); CUmodule GetModule() { return module; } int GetComputeCapability() { return computeCapability; } template void SetGlobal( int id, const T& value ) { CheckError( "cuMemcpyHtoD", cuMemcpyHtoD( globalVar[id].address, &value, globalVar[id].size ) ); } void SetGlobal( int id, const void* data, int size ) { CheckError( "cuMemcpyHtoD", cuMemcpyHtoD( globalVar[id].address, data, size ) ); } int GetGlobalID( const char* name ); int GetTexRefID( const char* name ); int GetSurfRefID( const char* name ); void MapResources(); void UnmapResources(); void LinkTexture( const char* name, GLTexture* texture ); int FindSurfRef( const char* name ); void LinkSurfaceToTexture( const char* name, GLTexture* texture ); static void CheckError( const char* funcName, CUresult res ); static void Error( const char* message ); static CUcontext& GetContext() { return context; } private: CUmodule module; Global globalVar[MAXGLOBALS]; TexRef texRef[MAXTEXREFS]; SurfRef surfRef[MAXSURFREFS]; int globals, texRefs, surfRefs; bool resourcesMapped; static bool initialized; static CUdevice device; static CUcontext context; static int computeCapability; public: static int SMCount; }; class CUDAKernel { struct GenericParameter { unsigned int b[64]; // space for mat4x4 and all smaller types int size, align; GenericParameter() {}; template GenericParameter( CUDABuffer& buffer ) { size = sizeof( CUdeviceptr ); align = __alignof(CUdeviceptr); *(CUdeviceptr*)b = buffer ? buffer.GetDevicePtr() : 0; } template GenericParameter( CUDABuffer* buffer ) { size = sizeof( CUdeviceptr ); align = __alignof(CUdeviceptr); *(CUdeviceptr*)b = buffer ? buffer->GetDevicePtr() : 0; } template GenericParameter( const T& v ) { size = sizeof( T ); align = __alignof(T); *(T*)b = v; } }; typedef const GenericParameter& P; public: CUDAKernel( CUDAModule* cudaModule, char* function ); ~CUDAKernel(); const CUfunction& GetKernel() const { return kernel; } void SetPreferL1( bool flag ) { preferL1 = flag; } template void SetArgument( int idx, const T& value ) { param[idx].size = sizeof( T ); *(T*)param[idx].b = value; paramCount = max( paramCount, idx + 1 ); paramsChanged = true; } template void SetArgument( int idx, CUDABuffer& buffer ) { param[idx].size = sizeof( CUdeviceptr ); *(CUdeviceptr*)param[idx].b = buffer.GetDevicePtr(); paramCount = max( paramCount, idx + 1 ); paramsChanged = true; } template void SetArgument( int idx, CUDABuffer* buffer ) { param[idx].size = sizeof( CUdeviceptr ); *(CUdeviceptr*)param[idx].b = buffer->GetDevicePtr(); paramCount = max( paramCount, idx + 1 ); paramsChanged = true; } void SetArguments( P p0 ) { F( 1 ); A( 0, p0 ); } void SetArguments( P p0, P p1 ) { F( 2 ); A( 0, p0 ); A( 1, p1 ); } void SetArguments( P p0, P p1, P p2 ) { F( 3 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); } void SetArguments( P p0, P p1, P p2, P p3 ) { F( 4 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4 ) { F( 5 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5 ) { F( 6 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6 ) { F( 7 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7 ) { F( 8 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8 ) { F( 9 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9 ) { F( 10 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA ) { F( 11 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB ) { F( 12 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); A( 11, pB ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC ) { F( 13 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); A( 11, pB ); A( 12, pC ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD ) { F( 14 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); A( 11, pB ); A( 12, pC ); A( 13, pD ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD, P pE ) { F( 15 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); A( 11, pB ); A( 12, pC ); A( 13, pD ); A( 14, pE ); } void SetArguments( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD, P pE, P pF ) { F( 16 ); A( 0, p0 ); A( 1, p1 ); A( 2, p2 ); A( 3, p3 ); A( 4, p4 ); A( 5, p5 ); A( 6, p6 ); A( 7, p7 ); A( 8, p8 ); A( 9, p9 ); A( 10, pA ); A( 11, pB ); A( 12, pC ); A( 13, pD ); A( 14, pE ); A( 15, pF ); } void SetBlockSize( int x, int y = 1 ) { blockDimX = x; blockDimY = y; dimensionsChanged = true; } void SetGridSize( int x, int y = 1 ) { gridDimX = x; gridDimY = y; } void Launch(); void Launch( P p0 ) { SetArguments( p0 ); Launch(); } void Launch( P p0, P p1 ) { SetArguments( p0, p1 ); Launch(); } void Launch( P p0, P p1, P p2 ) { SetArguments( p0, p1, p2 ); Launch(); } void Launch( P p0, P p1, P p2, P p3 ) { SetArguments( p0, p1, p2, p3 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4 ) { SetArguments( p0, p1, p2, p3, p4 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5 ) { SetArguments( p0, p1, p2, p3, p4, p5 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6 ) { SetArguments( p0, p1, p2, p3, p4, p5, p6 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7 ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8 ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9 ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9 ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD, P pE ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE ); Launch(); } void Launch( P p0, P p1, P p2, P p3, P p4, P p5, P p6, P p7, P p8, P p9, P pA, P pB, P pC, P pD, P pE, P pF ) { SetArguments( p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF ); Launch(); } static void AssertFailed( int* data ); private: void A( int i, P x ) { param[i] = x; } void F( int n ) { paramCount = n; paramsChanged = true; } void FindKernel( const char* name ); private: CUDAModule* module; CUfunction kernel; GenericParameter param[MAXPARAMS]; unsigned char* paramBuffer; int paramCount, paramSize, blockDimX, blockDimY, gridDimX, gridDimY; bool firstLaunch, paramsChanged, dimensionsChanged, preferL1; }; }; // namespace cudalab #endif