From f33ef2ee854d24b2a967f04e6b38df1969e8879d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 5 May 2019 07:45:32 +0900 Subject: [PATCH] update xbyak to 5.79 --- src/xbyak/xbyak.h | 73 ++++++++++++----- src/xbyak/xbyak_mnemonic.h | 10 +-- src/xbyak/xbyak_util.h | 163 +++++++++++++++++++++++++++---------- 3 files changed, 177 insertions(+), 69 deletions(-) diff --git a/src/xbyak/xbyak.h b/src/xbyak/xbyak.h index bcfeb34..c28a536 100644 --- a/src/xbyak/xbyak.h +++ b/src/xbyak/xbyak.h @@ -113,7 +113,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5751 /* 0xABCD = A.BC(D) */ + VERSION = 0x5790 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -186,7 +186,8 @@ enum { ERR_INVALID_ZERO, ERR_INVALID_RIP_IN_AUTO_GROW, ERR_INVALID_MIB_ADDRESS, - ERR_INTERNAL + ERR_INTERNAL, + ERR_X2APIC_IS_NOT_SUPPORTED }; class Error : public std::exception { @@ -248,6 +249,7 @@ public: "invalid rip in AutoGrow", "invalid mib address", "internal error", + "x2APIC is not supported" }; assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl)); return errTbl[err_]; @@ -431,7 +433,8 @@ public: kind_ = kind; bit_ = kind == XMM ? 128 : kind == YMM ? 
256 : 512; } - void setBit(int bit) { bit_ = bit; } + // err if MMX/FPU/BNDREG (MEM/OPMASK only get their bit size updated) + void setBit(int bit); void setOpmaskIdx(int idx, bool ignore_idx0 = false) { if (!ignore_idx0 && idx == 0) throw Error(ERR_K0_IS_INVALID); @@ -514,6 +517,48 @@ public: const Reg& getReg() const; }; +inline void Operand::setBit(int bit) +{ + if (bit != 8 && bit != 16 && bit != 32 && bit != 64 && bit != 128 && bit != 256 && bit != 512) goto ERR; + if (isBit(bit)) return; + if (is(MEM | OPMASK)) { /* OPMASK: vfpclasspd/ps re-size the mask register via changeBit() */ + bit_ = bit; + return; + } + if (is(REG | XMM | YMM | ZMM)) { + int idx = getIdx(); + // err if converting ah, bh, ch, dh + if (isREG(8) && (4 <= idx && idx < 8) && !isExt8bit()) goto ERR; + Kind kind = REG; + switch (bit) { + case 8: + if (idx >= 16) goto ERR; +#ifdef XBYAK32 + if (idx >= 4) goto ERR; +#else + if (4 <= idx && idx < 8) idx |= EXT8BIT; +#endif + break; + case 16: + case 32: + case 64: + if (idx >= 16) goto ERR; + break; + case 128: kind = XMM; break; + case 256: kind = YMM; break; + case 512: kind = ZMM; break; + } + idx_ = idx; + kind_ = kind; + bit_ = bit; + mask_ = 0; + rounding_ = 0; + return; + } +ERR: + throw Error(ERR_CANT_CONVERT); +} + class Label; struct Reg8; @@ -526,7 +571,8 @@ class Reg : public Operand { public: Reg() { } Reg(int idx, Kind kind, int bit = 0, bool ext8bit = false) : Operand(idx, kind, bit, ext8bit) { } - Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); } + // convert to Reg8/Reg16/Reg32/Reg64/XMM/YMM/ZMM + Reg changeBit(int bit) const { Reg r(*this); r.setBit(bit); return r; } uint8 getRexW() const { return isREG(64) ? 8 : 0; } uint8 getRexR() const { return isExtIdx() ? 4 : 0; } uint8 getRexX() const { return isExtIdx() ?
2 : 0; } @@ -650,34 +696,23 @@ struct RegRip { inline Reg8 Reg::cvt8() const { - const int idx = getIdx(); - if (isBit(8)) return Reg8(idx, isExt8bit()); -#ifdef XBYAK32 - if (idx >= 4) throw Error(ERR_CANT_CONVERT); -#endif - return Reg8(idx, 4 <= idx && idx < 8); + Reg r = changeBit(8); return Reg8(r.getIdx(), r.isExt8bit()); } inline Reg16 Reg::cvt16() const { - const int idx = getIdx(); - if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT); - return Reg16(idx); + return Reg16(changeBit(16).getIdx()); } inline Reg32 Reg::cvt32() const { - const int idx = getIdx(); - if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT); - return Reg32(idx); + return Reg32(changeBit(32).getIdx()); } #ifdef XBYAK64 inline Reg64 Reg::cvt64() const { - const int idx = getIdx(); - if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT); - return Reg64(idx); + return Reg64(changeBit(64).getIdx()); } #endif diff --git a/src/xbyak/xbyak_mnemonic.h b/src/xbyak/xbyak_mnemonic.h index 766f2f6..2733c61 100644 --- a/src/xbyak/xbyak_mnemonic.h +++ b/src/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "5.751"; } +const char *getVersionString() const { return "5.79"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -1676,8 +1676,8 @@ void vbroadcasti32x4(const Ymm& y, const Operand& op) { opAVX_X_XM_IMM(y, op, T_ void vbroadcasti32x8(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N32, 0x5B); } void vbroadcasti64x2(const Ymm& y, const Operand& op) { opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N16, 0x5A); } void vbroadcasti64x4(const Zmm& z, const Operand& op) { 
opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N32, 0x5B); } -void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } -void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } +void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0xC2, imm); } +void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0xC2, imm); } void vcmpsd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N8 | T_F2 | T_0F | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } void vcmpss(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N4 | T_F3 | T_0F | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); } void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x63); } @@ -1725,8 +1725,8 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { o void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0x55, imm); } void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0x55, imm); } -void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); 
opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } -void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } +void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } +void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } void vfpclasssd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm); } void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); } void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 1); } diff --git a/src/xbyak/xbyak_util.h b/src/xbyak/xbyak_util.h index 0154450..c2474c5 100644 --- a/src/xbyak/xbyak_util.h +++ b/src/xbyak/xbyak_util.h @@ -9,6 +9,11 @@ */ #include "xbyak.h" +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + #define XBYAK_INTEL_CPU_SPECIFIC +#endif + +#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef _MSC_VER #if (_MSC_VER < 1400) && defined(XBYAK32) static inline __declspec(naked) void __cpuid(int[4], int) @@ -47,14 +52,30 @@ #endif #endif #endif +#endif namespace Xbyak { namespace util { +typedef enum { + SmtLevel = 1, + CoreLevel = 2 +} IntelCpuTopologyLevel; + /** CPU detection class */ class Cpu { uint64 type_; + //system 
topology + bool x2APIC_supported_; + static const size_t maxTopologyLevels = 2; + unsigned int numCores_[maxTopologyLevels]; + + static const unsigned int maxNumberCacheLevels = 10; + unsigned int dataCacheSize_[maxNumberCacheLevels]; + unsigned int coresSharignDataCache_[maxNumberCacheLevels]; + unsigned int dataCacheLevels_; + unsigned int get32bitAsBE(const char *x) const { return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); @@ -65,7 +86,7 @@ class Cpu { } void setFamily() { - unsigned int data[4]; + unsigned int data[4] = {}; getCpuid(1, data); stepping = data[0] & mask(4); model = (data[0] >> 4) & mask(4); @@ -88,6 +109,39 @@ class Cpu { { return (val >> base) & ((1u << (end - base)) - 1); } + void setNumCores() + { + if ((type_ & tINTEL) == 0) return; + + unsigned int data[4] = {}; + + /* CAUTION: These numbers are configuration as shipped by Intel. */ + getCpuidEx(0x0, 0, data); + if (data[0] >= 0xB) { + /* + if leaf 11 exists(x2APIC is supported), + we use it to get the number of smt cores and cores on socket + + leaf 0xB can be zeroed-out by a hypervisor + */ + x2APIC_supported_ = true; + for (unsigned int i = 0; i < maxTopologyLevels; i++) { + getCpuidEx(0xB, i, data); + IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); + if (level == SmtLevel || level == CoreLevel) { + numCores_[level - 1] = extractBit(data[1], 0, 15); + } + } + } else { + /* + Failed to determine num of cores without x2APIC support. + TODO: USE initial APIC ID to determine ncores.
+ */ + numCores_[SmtLevel - 1] = 0; + numCores_[CoreLevel - 1] = 0; + } + + } void setCacheHierarchy() { if ((type_ & tINTEL) == 0) return; @@ -96,21 +150,12 @@ class Cpu { // const unsigned int INSTRUCTION_CACHE = 2; const unsigned int UNIFIED_CACHE = 3; unsigned int smt_width = 0; - unsigned int n_cores = 0; - unsigned int data[4]; - - /* - if leaf 11 exists, we use it to get the number of smt cores and cores on socket - If x2APIC is supported, these are the only correct numbers. + unsigned int logical_cores = 0; + unsigned int data[4] = {}; - leaf 0xB can be zeroed-out by a hypervisor - */ - getCpuidEx(0x0, 0, data); - if (data[0] >= 0xB) { - getCpuidEx(0xB, 0, data); // CPUID for SMT Level - smt_width = data[1] & 0x7FFF; - getCpuidEx(0xB, 1, data); // CPUID for CORE Level - n_cores = data[1] & 0x7FFF; + if (x2APIC_supported_) { + smt_width = numCores_[0]; + logical_cores = numCores_[1]; } /* @@ -118,29 +163,29 @@ class Cpu { the first level of data cache is not shared (which is the case for every existing architecture) and use this to determine the SMT width for arch not supporting leaf 11. - when leaf 4 reports a number of core less than n_cores + when leaf 4 reports a number of core less than numCores_ on socket reported by leaf 11, then it is a correct number of cores not an upperbound. 
*/ - for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { + for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { getCpuidEx(0x4, i, data); unsigned int cacheType = extractBit(data[0], 0, 4); if (cacheType == NO_CACHE) break; if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { - unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; - if (n_cores != 0) { // true only if leaf 0xB is supported and valid - nb_logical_cores = (std::min)(nb_logical_cores, n_cores); + unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; + if (logical_cores != 0) { // true only if leaf 0xB is supported and valid + actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); } - assert(nb_logical_cores != 0); - data_cache_size[data_cache_levels] = + assert(actual_logical_cores != 0); + dataCacheSize_[dataCacheLevels_] = (extractBit(data[1], 22, 31) + 1) * (extractBit(data[1], 12, 21) + 1) * (extractBit(data[1], 0, 11) + 1) * (data[2] + 1); - if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; + if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u); - data_cache_levels++; + coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + dataCacheLevels_++; } } } @@ -154,22 +199,25 @@ public: int displayFamily; // family + extFamily int displayModel; // model + extModel - // may I move these members into private? 
- static const unsigned int maxNumberCacheLevels = 10; - unsigned int data_cache_size[maxNumberCacheLevels]; - unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; - unsigned int data_cache_levels; + unsigned int getNumCores(IntelCpuTopologyLevel level) const { /* SmtLevel: logical CPUs per core, CoreLevel: cores per package */ + if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); + switch (level) { + case SmtLevel: return numCores_[level - 1]; + case CoreLevel: return numCores_[SmtLevel - 1] == 0 ? 0 : numCores_[level - 1] / numCores_[SmtLevel - 1]; /* leaf 0xB may be zeroed-out by a hypervisor: report 0 instead of dividing by zero */ + default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); + } + } - unsigned int getDataCacheLevels() const { return data_cache_levels; } + unsigned int getDataCacheLevels() const { return dataCacheLevels_; } unsigned int getCoresSharingDataCache(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return cores_sharing_data_cache[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return coresSharignDataCache_[i]; } unsigned int getDataCacheSize(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return data_cache_size[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return dataCacheSize_[i]; } /* @@ -177,30 +225,45 @@ public: */ static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuid(reinterpret_cast<int*>(data), eaxIn); -#else + #else __cpuid(eaxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)data; #endif } static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); -#else + #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)ecxIn; + (void)data; #endif } static inline uint64 getXfeature() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return _xgetbv(0); -#else
+ #else unsigned int eax, edx; // xgetvb is not support on gcc 4.2 // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64)edx << 32) | eax; + #endif +#else + return 0; #endif } typedef uint64 Type; @@ -271,9 +334,13 @@ public: Cpu() : type_(NONE) - , data_cache_levels(0) + , x2APIC_supported_(false) + , numCores_() + , dataCacheSize_() + , coresSharignDataCache_() + , dataCacheLevels_(0) { - unsigned int data[4]; + unsigned int data[4] = {}; const unsigned int& EAX = data[0]; const unsigned int& EBX = data[1]; const unsigned int& ECX = data[2]; @@ -363,6 +430,7 @@ public: if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); + setNumCores(); setCacheHierarchy(); } void putFamily() const @@ -381,12 +449,17 @@ class Clock { public: static inline uint64 getRdtsc() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return __rdtsc(); -#else + #else unsigned int eax, edx; __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); return ((uint64)edx << 32) | eax; + #endif +#else + // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu + return 0; #endif } Clock()