/* Copyright (c) 2011 Wildfire Games
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "precompiled.h"
#include "lib/sysdep/arch/x86_x64/cache.h"

#include "lib/bits.h"
#include "lib/alignment.h"
#include "lib/module_init.h"
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/arch/x86_x64/x86_x64.h"

namespace x86_x64 {

static const size_t maxTLBs = 2*2*4;	// (level0, level1) x (D,I) x (4K, 2M, 4M, 1G)
static size_t numTLBs = 0;

static const size_t numCaches = x86_x64::Cache::maxLevels * 2 + maxTLBs;
static Cache caches[numCaches];

static void AddCache(const x86_x64::Cache& cache)
{
	ENSURE(cache.Validate());

	if(cache.type == x86_x64::Cache::kData || cache.type == x86_x64::Cache::kUnified)
		caches[L1D + cache.level-1] = cache;
	if(cache.type == x86_x64::Cache::kInstruction || cache.type == x86_x64::Cache::kUnified)
		caches[L1I + cache.level-1] = cache;
}

static void AddTLB(const x86_x64::Cache& tlb)
{
	ENSURE(tlb.Validate());
	ENSURE(tlb.level == 1 || tlb.level == 2);	// see maxTLBs
	ENSURE(numTLBs < maxTLBs);
	caches[TLB+numTLBs++] = tlb;
}
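// Layout sketch of caches[] as implied by AddCache/AddTLB (assuming the
// L1D, L1I and TLB index constants are declared in cache.h):
//   caches[L1D+0 .. L1D+maxLevels-1]  data or unified caches, one slot per level
//   caches[L1I+0 .. L1I+maxLevels-1]  instruction or unified caches, one slot per level
//   caches[TLB+0 .. TLB+numTLBs-1]    TLBs, appended in detection order
// A unified cache is stored in both the data and the instruction slot of its level.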
//-----------------------------------------------------------------------------
// AMD

// (Intel has subsequently added support for function 0x80000006, but
// only returns ECX, i.e. L2 information.)

namespace AMD
{

static x86_x64::Cache L1Cache(u32 reg, x86_x64::Cache::Type type)
{
	x86_x64::Cache cache;
	cache.Initialize(1, type);
	const size_t lineSize      = bits(reg,  0,  7);
	const size_t associativity = bits(reg, 16, 23);	// 0 = reserved
	const size_t totalSize     = bits(reg, 24, 31)*KiB;
	if(lineSize != 0 && associativity != 0 && totalSize != 0)
	{
		cache.numEntries = totalSize / lineSize;
		cache.entrySize = lineSize;
		cache.associativity = associativity;
		cache.sharedBy = 1;
	}
	return cache;
}

// applies to L2, L3 and TLB2
static const size_t associativityTable[16] =
{
	0, 1, 2, 0, 4, 0, 8, 0,
	16, 0, 32, 48, 64, 96, 128, x86_x64::Cache::fullyAssociative
};

static x86_x64::Cache L2Cache(u32 reg, x86_x64::Cache::Type type)
{
	x86_x64::Cache cache;
	cache.Initialize(2, type);
	const size_t lineSize         = bits(reg,  0,  7);
	const size_t idxAssociativity = bits(reg, 12, 15);	// 0 = disabled
	const size_t totalSize        = bits(reg, 16, 31)*KiB;
	if(lineSize != 0 && idxAssociativity != 0 && totalSize != 0)
	{
		cache.numEntries = totalSize / lineSize;
		cache.entrySize = lineSize;
		cache.associativity = associativityTable[idxAssociativity];
		cache.sharedBy = 1;
	}
	return cache;
}

// (same as L2 except for the size)
static x86_x64::Cache L3Cache(u32 reg, x86_x64::Cache::Type type)
{
	x86_x64::Cache cache;
	cache.Initialize(3, type);
	const size_t lineSize         = bits(reg,  0,  7);
	const size_t idxAssociativity = bits(reg, 12, 15);	// 0 = disabled
	const size_t totalSize        = bits(reg, 18, 31)*512*KiB;	// (rounded down)
	// NB: some Athlon 64 X2 models have no L3 cache
	if(lineSize != 0 && idxAssociativity != 0 && totalSize != 0)
	{
		cache.numEntries = totalSize / lineSize;
		cache.entrySize = lineSize;
		cache.associativity = associativityTable[idxAssociativity];
		cache.sharedBy = 1;
	}
	return cache;
}

static x86_x64::Cache TLB1(u32 reg, size_t bitOffset, size_t pageSize, x86_x64::Cache::Type type)
{
	x86_x64::Cache cache;
	cache.Initialize(1, type);
	const size_t numEntries    = bits(reg, bitOffset+0, bitOffset+ 7);
	const size_t associativity = bits(reg, bitOffset+8, bitOffset+15);	// 0 = reserved
	if(numEntries != 0 && associativity != 0)
	{
		cache.numEntries = numEntries;
		cache.entrySize = pageSize;
		cache.associativity = associativity;
		cache.sharedBy = 1;
	}
	return cache;
}

static x86_x64::Cache TLB2(u32 reg, size_t bitOffset, size_t pageSize, x86_x64::Cache::Type type)
{
	x86_x64::Cache cache;
	cache.Initialize(2, type);
	const size_t numEntries       = bits(reg, bitOffset+ 0, bitOffset+11);
	const size_t idxAssociativity = bits(reg, bitOffset+12, bitOffset+15);	// 0 = disabled
	if(numEntries != 0 && idxAssociativity != 0)
	{
		cache.numEntries = numEntries;
		cache.entrySize = pageSize;
		cache.associativity = associativityTable[idxAssociativity];
		cache.sharedBy = 1;
	}
	return cache;
}

static void AddTLB2Pair(u32 reg, size_t pageSize)
{
	x86_x64::Cache::Type type = x86_x64::Cache::kUnified;
	if(bits(reg, 16, 31) != 0)	// not unified
	{
		AddTLB(TLB2(reg, 16, pageSize, x86_x64::Cache::kData));
		type = x86_x64::Cache::kInstruction;
	}
	AddTLB(TLB2(reg, 0, pageSize, type));
}
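// Worked example (hypothetical register value, not taken from a specific CPU,
// decoded per the L1Cache bit layout above): CPUID.80000005 ECX = 0x40020140
//   bits 24..31 = 0x40 -> totalSize     = 64 KiB
//   bits 16..23 = 0x02 -> associativity = 2-way
//   bits  0.. 7 = 0x40 -> lineSize      = 64 bytes
// giving numEntries = 64 KiB / 64 B = 1024 lines. (Bits 8..15, the
// lines-per-tag field, are ignored by this code.)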
// AMD reports maxCpuidIdFunction > 4 but considers functions 2..4 to be
// "reserved". cache characteristics are returned via ext. functions.
static void DetectCacheAndTLB()
{
	x86_x64::CpuidRegs regs = { 0 };

	regs.eax = 0x80000005;
	if(x86_x64::cpuid(&regs))
	{
		AddCache(L1Cache(regs.ecx, x86_x64::Cache::kData));
		AddCache(L1Cache(regs.edx, x86_x64::Cache::kInstruction));
		AddTLB(TLB1(regs.eax,  0, 2*MiB, x86_x64::Cache::kInstruction));
		AddTLB(TLB1(regs.eax, 16, 2*MiB, x86_x64::Cache::kData));
		AddTLB(TLB1(regs.ebx,  0, 4*KiB, x86_x64::Cache::kInstruction));
		AddTLB(TLB1(regs.ebx, 16, 4*KiB, x86_x64::Cache::kData));
	}

	regs.eax = 0x80000006;
	if(x86_x64::cpuid(&regs))
	{
		AddCache(L2Cache(regs.ecx, x86_x64::Cache::kUnified));
		AddCache(L3Cache(regs.edx, x86_x64::Cache::kUnified));
		AddTLB2Pair(regs.eax, 2*MiB);
		AddTLB2Pair(regs.ebx, 4*KiB);
	}
}

}	// namespace AMD


//-----------------------------------------------------------------------------
// CPUID.4

namespace CPUID4
{

static bool DetectCache()
{
	// note: level order is unspecified (see Intel AP-485)
	for(u32 count = 0; ; count++)
	{
		x86_x64::CpuidRegs regs = { 0 };
		regs.eax = 4;
		regs.ecx = count;
		if(!x86_x64::cpuid(&regs))
			return false;

		const x86_x64::Cache::Type type = (x86_x64::Cache::Type)bits(regs.eax, 0, 4);
		if(type == x86_x64::Cache::kNull)	// no more remaining
			break;

		const size_t level      = (size_t)bits(regs.eax,  5,  7);
		const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
		const size_t sets       = (size_t)bits(regs.ecx,  0, 31)+1;

		x86_x64::Cache cache;
		cache.Initialize(level, type);
		cache.entrySize     = (size_t)bits(regs.ebx,  0, 11)+1;	// (yes, this also uses +1 encoding)
		cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
		cache.sharedBy      = (size_t)bits(regs.eax, 14, 25)+1;
		cache.numEntries    = cache.associativity * partitions * sets;

		AddCache(cache);
	}

	return true;
}

}	// namespace CPUID4


//-----------------------------------------------------------------------------
// CPUID.2 (descriptors)

namespace CPUID2
{

typedef u8 Descriptor;
typedef std::vector<Descriptor> Descriptors;

static void AppendDescriptors(u32 reg, Descriptors& descriptors)
{
	if(IsBitSet(reg, 31))	// register contents are reserved
		return;
	for(int pos = 24; pos >= 0; pos -= 8)
	{
		const u8 descriptor = (u8)bits(reg, pos, pos+7);
		if(descriptor != 0)
			descriptors.push_back(descriptor);
	}
}

static Descriptors GetDescriptors()
{
	// ensure consistency by pinning to a CPU.
	// (don't use a hard-coded mask because process affinity may be restricted)
	const uintptr_t allProcessors = os_cpu_ProcessorMask();
	const uintptr_t firstProcessor = allProcessors & -intptr_t(allProcessors);
	const uintptr_t prevAffinityMask = os_cpu_SetThreadAffinityMask(firstProcessor);

	x86_x64::CpuidRegs regs = { 0 };
	regs.eax = 2;
	if(!x86_x64::cpuid(&regs))
		return Descriptors();

	Descriptors descriptors;
	size_t iterations = bits(regs.eax, 0, 7);
	for(;;)	// abort mid-loop (invoke CPUID exactly 'iterations' times)
	{
		AppendDescriptors(bits(regs.eax, 8, 31), descriptors);
		AppendDescriptors(regs.ebx, descriptors);
		AppendDescriptors(regs.ecx, descriptors);
		AppendDescriptors(regs.edx, descriptors);
		if(--iterations == 0)
			break;
		regs.eax = 2;
		const bool ok = x86_x64::cpuid(&regs);
		ENSURE(ok);
	}

	os_cpu_SetThreadAffinityMask(prevAffinityMask);

	return descriptors;
}
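// Worked example (hypothetical register contents): if CPUID.2 returned
// EAX = 0x665B5001, the low byte (0x01) is the iteration count consumed by
// GetDescriptors, and AppendDescriptors(bits(eax, 8, 31), ...) sees the value
// 0x00665B50, yielding the descriptors 0x66, 0x5B and 0x50 (zero bytes are
// skipped). EBX/ECX/EDX each contribute up to four descriptor bytes unless
// their bit 31 is set, which marks the register contents as reserved.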
// note: the following cannot be moved into a function because
// ARRAY_SIZE's template argument must not reference a local type.
enum Flags
{
	// level (bits 0..1)
	L1 = 1,
	L2,
	L3,

	// type (bits 2..3)
	I = 0x04,	// instruction
	D = 0x08,	// data
	U = I|D		// unified

	// largeSize (bits 4..31 with bits 0..3 zeroed): TLB entrySize or cache numEntries
};

// (there are > 100 descriptors, so we squeeze all fields into 8 bytes.)
struct Characteristics	// POD
{
	x86_x64::Cache::Type Type() const
	{
		switch(flags & U)
		{
		case D:
			return x86_x64::Cache::kData;
		case I:
			return x86_x64::Cache::kInstruction;
		case U:
			return x86_x64::Cache::kUnified;
		default:
			DEBUG_WARN_ERR(ERR::LOGIC);
			return x86_x64::Cache::kNull;
		}
	}

	size_t Level() const
	{
		const size_t level = flags & 3;
		ENSURE(level != 0);
		return level;
	}

	bool IsTLB() const
	{
		return smallSize >= 0;
	}

	size_t NumEntries() const
	{
		return IsTLB()? (size_t)smallSize : (flags & ~0xF);
	}

	size_t EntrySize() const
	{
		return IsTLB()? (flags & ~0xF) : (size_t)(-smallSize);
	}

	u8 descriptor;
	u8 associativity;
	i16 smallSize;	// negative cache entrySize or TLB numEntries
	u32 flags;	// level, type, largeSize
};

static const u8 F = x86_x64::Cache::fullyAssociative;

#define CACHE(descriptor, flags, totalSize, assoc, entrySize) { descriptor, assoc, -entrySize, flags | ((totalSize)/(entrySize)) }
#define TLB(descriptor, flags, entrySize, assoc, numEntries) { descriptor, assoc, numEntries, flags | (entrySize) }
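// Packing example (illustrative only): CACHE(0x2C, L1|D, 32*KiB, 8, 64)
// expands to { 0x2C, 8, -64, L1|D | (32*KiB/64) }, i.e.
//   descriptor = 0x2C, associativity = 8,
//   smallSize  = -64  (negative -> cache; EntrySize() returns 64),
//   flags      = L1|D | 512  (NumEntries() masks off the low 4 bits -> 512 lines).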
// (we need to include cache descriptors because early Pentium4 don't implement CPUID.4)
// references: [accessed 2011-02-26]
// AP485 http://www.intel.com/Assets/PDF/appnote/241618.pdf
// sdman http://www.intel.com/Assets/PDF/manual/253666.pdf
// sandp http://www.sandpile.org/ia32/cpuid.htm
// opsol http://src.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/i86pc/os/cpuid.c
static const Characteristics characteristicsTable[] =
{
	TLB  (0x01, L1|I,   4*KiB,  4,  32),
	TLB  (0x02, L1|I,   4*MiB,  F,   2),
	TLB  (0x03, L1|D,   4*KiB,  4,  64),
	TLB  (0x04, L1|D,   4*MiB,  4,   8),
	TLB  (0x05, L1|D,   4*MiB,  4,  32),
	CACHE(0x06, L1|I,   8*KiB,  4,  32),
	CACHE(0x08, L1|I,  16*KiB,  4,  32),
	CACHE(0x09, L1|I,  32*KiB,  4,  64),
	CACHE(0x0A, L1|I,   8*KiB,  2,  32),
	TLB  (0x0B, L1|I,   4*MiB,  4,   4),
	CACHE(0x0C, L1|D,  16*KiB,  4,  32),
	CACHE(0x0D, L1|D,  16*KiB,  4,  64),	// opsol: 32B (would be redundant with 0x0C), AP485: 64B, sdman: 64B
	CACHE(0x0E, L1|D,  24*KiB,  6,  64),
	CACHE(0x21, L2|U, 256*KiB,  8,  64),
	CACHE(0x22, L3|U, 512*KiB,  4,  64),
	CACHE(0x23, L3|U,   1*MiB,  8,  64),
	CACHE(0x25, L3|U,   2*MiB,  8,  64),
	CACHE(0x29, L3|U,   4*MiB,  8,  64),
	CACHE(0x2C, L1|D,  32*KiB,  8,  64),
	CACHE(0x30, L1|I,  32*KiB,  8,  64),
	CACHE(0x39, L2|U, 128*KiB,  4,  64),
	CACHE(0x3A, L2|U, 192*KiB,  6,  64),
	CACHE(0x3B, L2|U, 128*KiB,  2,  64),
	CACHE(0x3C, L2|U, 256*KiB,  4,  64),
	CACHE(0x3D, L2|U, 384*KiB,  6,  64),
	CACHE(0x3E, L2|U, 512*KiB,  4,  64),
	CACHE(0x41, L2|U, 128*KiB,  4,  32),
	CACHE(0x42, L2|U, 256*KiB,  4,  32),
	CACHE(0x43, L2|U, 512*KiB,  4,  32),
	CACHE(0x44, L2|U,   1*MiB,  4,  32),
	CACHE(0x45, L2|U,   2*MiB,  4,  32),
	CACHE(0x46, L3|U,   4*MiB,  4,  64),
	CACHE(0x47, L3|U,   8*MiB,  8,  64),
	CACHE(0x48, L2|U,   3*MiB, 12,  64),
	CACHE(0x49, L2|U,   4*MiB, 16,  64),
	CACHE(0x49, L3|U,   4*MiB, 16,  64),
	CACHE(0x4A, L3|U,   6*MiB, 12,  64),
	CACHE(0x4B, L3|U,   8*MiB, 16,  64),
	CACHE(0x4C, L3|U,  12*MiB, 12,  64),
	CACHE(0x4D, L3|U,  16*MiB, 16,  64),
	CACHE(0x4E, L2|U,   6*MiB, 24,  64),
	TLB  (0x4F, L1|I,   4*KiB,  F,  32),	// sandp: unknown assoc, opsol: full, AP485: unspecified
	TLB  (0x50, L1|I,   4*KiB,  F,  64),
	TLB  (0x50, L1|I,   4*MiB,  F,  64),
	TLB  (0x50, L1|I,   2*MiB,  F,  64),
	TLB  (0x51, L1|I,   4*KiB,  F, 128),
	TLB  (0x51, L1|I,   4*MiB,  F, 128),
	TLB  (0x51, L1|I,   2*MiB,  F, 128),
	TLB  (0x52, L1|I,   4*KiB,  F, 256),
	TLB  (0x52, L1|I,   4*MiB,  F, 256),
	TLB  (0x52, L1|I,   2*MiB,  F, 256),
	TLB  (0x55, L1|I,   4*MiB,  F,   7),
	TLB  (0x55, L1|I,   2*MiB,  F,   7),
	TLB  (0x56, L1|D,   4*MiB,  4,  16),
	TLB  (0x57, L1|D,   4*KiB,  4,  16),
	TLB  (0x59, L1|D,   4*KiB,  F,  16),
	TLB  (0x5A, L1|D,   4*MiB,  4,  32),
	TLB  (0x5A, L1|D,   2*MiB,  4,  32),
	TLB  (0x5B, L1|D,   4*KiB,  F,  64),
	TLB  (0x5B, L1|D,   4*MiB,  F,  64),
	TLB  (0x5C, L1|D,   4*KiB,  F, 128),
	TLB  (0x5C, L1|D,   4*MiB,  F, 128),
	TLB  (0x5D, L1|D,   4*KiB,  F, 256),
	TLB  (0x5D, L1|D,   4*MiB,  F, 256),
	CACHE(0x60, L1|D,  16*KiB,  8,  64),
	TLB  (0x63, L1|D,   1*GiB,  4,   4),	// speculation
	CACHE(0x66, L1|D,   8*KiB,  4,  64),
	CACHE(0x67, L1|D,  16*KiB,  4,  64),
	CACHE(0x68, L1|D,  32*KiB,  4,  64),
	CACHE(0x70, L1|I,  12*KiB,  8,   1),
	CACHE(0x71, L1|I,  16*KiB,  8,   1),
	CACHE(0x72, L1|I,  32*KiB,  8,   1),
	CACHE(0x73, L1|I,  64*KiB,  8,   1),
	TLB  (0x76, L1|I,   4*MiB,  F,   8),	// AP485: internally inconsistent, sdman: TLB
	TLB  (0x76, L1|I,   2*MiB,  F,   8),
	CACHE(0x78, L2|U,   1*MiB,  4,  64),
	CACHE(0x79, L2|U, 128*KiB,  8,  64),
	CACHE(0x7A, L2|U, 256*KiB,  8,  64),
	CACHE(0x7B, L2|U, 512*KiB,  8,  64),
	CACHE(0x7C, L2|U,   1*MiB,  8,  64),
	CACHE(0x7D, L2|U,   2*MiB,  8,  64),
	CACHE(0x7F, L2|U, 512*KiB,  2,  64),
	CACHE(0x80, L2|U, 512*KiB,  8,  64),
	CACHE(0x82, L2|U, 256*KiB,  8,  32),
	CACHE(0x83, L2|U, 512*KiB,  8,  32),
	CACHE(0x84, L2|U,   1*MiB,  8,  32),
	CACHE(0x85, L2|U,   2*MiB,  8,  32),
	CACHE(0x86, L2|U, 512*KiB,  4,  64),
	CACHE(0x87, L2|U,   1*MiB,  8,  64),
	TLB  (0xB0, L1|I,   4*KiB,  4, 128),
	TLB  (0xB1, L1|I,   2*MiB,  4,   8),
	TLB  (0xB1, L1|I,   4*MiB,  4,   4),
	TLB  (0xB2, L1|I,   4*KiB,  4,  64),
	TLB  (0xB3, L1|D,   4*KiB,  4, 128),
	TLB  (0xB3, L1|D,   4*MiB,  4, 128),
	TLB  (0xB4, L1|D,   4*KiB,  4, 256),
	TLB  (0xB4, L1|D,   4*MiB,  4, 256),
	TLB  (0xB5, L1|I,   4*KiB,  4, 128),	// speculation
	TLB  (0xB6, L1|I,   4*KiB,  8, 128),	// http://software.intel.com/en-us/forums/topic/401012
	TLB  (0xBA, L1|D,   4*KiB,  4,  64),
	TLB  (0xC0, L1|D,   4*KiB,  4,   8),
	TLB  (0xC0, L1|D,   4*MiB,  4,   8),
	TLB  (0xC1, L2|U,   4*KiB,  8, 1024),	// http://software.intel.com/en-us/forums/topic/401012
	TLB  (0xC1, L2|U,   4*MiB,  8, 1024),
	TLB  (0xC1, L2|U,   2*MiB,  8, 1024),
	TLB  (0xCA, L2|U,   4*KiB,  4, 512),
	CACHE(0xD0, L3|U, 512*KiB,  4,  64),
	CACHE(0xD1, L3|U,   1*MiB,  4,  64),
	CACHE(0xD2, L3|U,   2*MiB,  4,  64),
	CACHE(0xD6, L3|U,   1*MiB,  8,  64),
	CACHE(0xD7, L3|U,   2*MiB,  8,  64),
	CACHE(0xD8, L3|U,   4*MiB,  8,  64),
	CACHE(0xDC, L3|U, 3*MiB/2, 12,  64),
	CACHE(0xDD, L3|U,   3*MiB, 12,  64),
	CACHE(0xDE, L3|U,   6*MiB, 12,  64),
	CACHE(0xE2, L3|U,   2*MiB, 16,  64),
	CACHE(0xE3, L3|U,   4*MiB, 16,  64),
	CACHE(0xE4, L3|U,   8*MiB, 16,  64),
	CACHE(0xEA, L3|U,  12*MiB, 24,  64),
	CACHE(0xEB, L3|U,  18*MiB, 24,  64),
	CACHE(0xEC, L3|U,  24*MiB, 24,  64),
};
#undef CACHE
#undef TLB
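// Some descriptors (e.g. 0x49, 0x50..0x52, 0xC1) appear several times above:
// either the same TLB covers multiple page sizes, or, as with 0x49, the
// descriptor's meaning depends on the CPU family/model. The lookup below
// simply returns the first matching entry.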
static const Characteristics* CharacteristicsFromDescriptor(Descriptor descriptor)
{
	// note: we can't use bsearch because characteristicsTable contains multiple
	// entries with the same descriptor.
	for(size_t i = 0; i < ARRAY_SIZE(characteristicsTable); i++)
	{
		const Characteristics& characteristics = characteristicsTable[i];
		if(characteristics.descriptor == descriptor)
			return &characteristics;
	}

	debug_printf("Unknown cache/TLB descriptor 0x%x\n", (unsigned int)descriptor);
	return 0;
}

enum DescriptorFlags
{
	SKIP_CACHE_DESCRIPTORS = 1,
	NO_LAST_LEVEL_CACHE    = 2,
	PREFETCH64             = 64,
	PREFETCH128            = 128
};

static bool HandleSpecialDescriptor(Descriptor descriptor, size_t& descriptorFlags)
{
	switch(descriptor)
	{
	case 0:	// carries no information
		return true;

	case 0x40:
		descriptorFlags |= NO_LAST_LEVEL_CACHE;
		return true;

	case 0xF0:
		descriptorFlags |= PREFETCH64;
		return true;

	case 0xF1:
		descriptorFlags |= PREFETCH128;
		return true;

	case 0xFF:	// descriptors don't include caches (use CPUID.4 instead)
		descriptorFlags |= SKIP_CACHE_DESCRIPTORS;
		return true;

	default:
		return false;
	}
}

static void DetectCacheAndTLB(size_t& descriptorFlags)
{
	const Descriptors descriptors = GetDescriptors();
	for(Descriptors::const_iterator it = descriptors.begin(); it != descriptors.end(); ++it)
	{
		const Descriptor descriptor = *it;
		if(HandleSpecialDescriptor(descriptor, descriptorFlags))
			continue;

		const Characteristics* characteristics = CharacteristicsFromDescriptor(descriptor);
		if(!characteristics)
			continue;

		if((descriptorFlags & SKIP_CACHE_DESCRIPTORS) && !characteristics->IsTLB())
			continue;

		x86_x64::Cache cache;
		cache.Initialize(characteristics->Level(), characteristics->Type());
		cache.numEntries = characteristics->NumEntries();
		cache.entrySize = characteristics->EntrySize();
		cache.associativity = characteristics->associativity;
		cache.sharedBy = 1;	// (safe default)
		if(characteristics->IsTLB())
			AddTLB(cache);
		else
			AddCache(cache);
	}
}

}	// namespace CPUID2


//-----------------------------------------------------------------------------

static Status DetectCacheAndTLB()
{
	// ensure all cache entries are initialized (DetectCache* might not set them all)
	for(size_t idxLevel = 0; idxLevel < x86_x64::Cache::maxLevels; idxLevel++)
	{
		caches[L1D+idxLevel].Initialize(idxLevel+1, x86_x64::Cache::kData);
		caches[L1I+idxLevel].Initialize(idxLevel+1, x86_x64::Cache::kInstruction);
	}

	if(x86_x64::Vendor() == x86_x64::VENDOR_AMD)
		AMD::DetectCacheAndTLB();
	else
	{
		size_t descriptorFlags = 0;
		if(CPUID4::DetectCache())	// success, ignore less reliable CPUID.2 cache information
			descriptorFlags |= CPUID2::SKIP_CACHE_DESCRIPTORS;
		CPUID2::DetectCacheAndTLB(descriptorFlags);
	}

	// sanity checks
	for(size_t idxLevel = 0; idxLevel < x86_x64::Cache::maxLevels; idxLevel++)
	{
		ENSURE(caches[L1D+idxLevel].type == x86_x64::Cache::kData || caches[L1D+idxLevel].type == x86_x64::Cache::kUnified);
		ENSURE(caches[L1D+idxLevel].level == idxLevel+1);
		ENSURE(caches[L1D+idxLevel].Validate() == true);

		ENSURE(caches[L1I+idxLevel].type == x86_x64::Cache::kInstruction || caches[L1I+idxLevel].type == x86_x64::Cache::kUnified);
		ENSURE(caches[L1I+idxLevel].level == idxLevel+1);
		ENSURE(caches[L1I+idxLevel].Validate() == true);
	}
	for(size_t i = 0; i < numTLBs; i++)
		ENSURE(caches[TLB+i].Validate() == true);

	return INFO::OK;
}

const x86_x64::Cache* Caches(size_t idxCache)
{
	static ModuleInitState initState;
	ModuleInit(&initState, DetectCacheAndTLB);

	if(idxCache >= TLB+numTLBs)
		return 0;

	return &caches[idxCache];
}

}	// namespace x86_x64
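// Minimal usage sketch (illustrative, not compiled here; relies only on names
// visible in this file, assuming the L1D index constant is exposed via cache.h):
//
//   const x86_x64::Cache* l1d = x86_x64::Caches(x86_x64::L1D);
//   if(l1d && l1d->numEntries != 0)
//   {
//       const size_t totalSize = l1d->numEntries * l1d->entrySize;
//       debug_printf("L1D: %d KiB, %d-way, %d B lines\n",
//           (int)(totalSize/KiB), (int)l1d->associativity, (int)l1d->entrySize);
//   }
//
// The first call runs DetectCacheAndTLB via ModuleInit; indices at or beyond
// TLB+numTLBs yield a null return, so callers should check the pointer.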