#ifndef MIE_ETAT_IMPLE_H_ #define MIE_ETAT_IMPLE_H_ #include "xbyak/xbyak.h" using namespace Xbyak; using namespace MIE_ETAT; class GeneratorForF3_97 : public Xbyak::CodeGenerator { public: GeneratorForF3_97() : CodeGenerator(1024 * 16) { } void createOptimizedMulLatter(int bit, int w) { const int sep = 32 / w; const int topBit = ((w - 1) / bit) * bit; const int firstShiftBit = w - topBit < bit ? w - topBit : bit; const int loop = (w + bit - 1) / bit; const int idxTblSize = loop * 4 * 4 + 16; push(esi); const int _P = 4 * 1; mov(edx, ptr [esp + _P + 8]); // xTbl mov(ecx, ptr [esp + _P + 12]); // b sub(esp, idxTblSize); mov(esi, esp); and(esi, uint32(~15)); // idxTbl /* w = 8, b = 4のとき45clk程度 */ makeIdxTbl(bit, w, firstShiftBit, topBit, ecx, esi); const Xmm* L0(&xm0); const Xmm* L1(&xm1); const Xmm* H0(&xm2); const Xmm* H1(&xm3); mov(eax, ptr [ecx + 16 + 12]); // b.get(m - 1).high add(eax, eax); add(eax, ptr [ecx + 12]); // b.get(m - 1).low shl(eax, 2); movaps(*L0, ptr [edx + eax * 8]); movaps(*H0, ptr [edx + eax * 8 + 16]); /* x << (96 - w + firstShiftBit) for x = [0:L0], [0:H0], w = 8, 16, 32 */ const int sb = (96 - w) / 8; xorps(*L1, *L1); xorps(*H1, *H1); xorps(xm4, xm4); xorps(xm5, xm5); palignr(*L1, *L0, 15 - sb); palignr(*H1, *H0, 15 - sb); palignr(xm4, *L0, 16 - sb); palignr(xm5, *H0, 16 - sb); psrlq(*L1, 8 - firstShiftBit); psrlq(*H1, 8 - firstShiftBit); psllq(xm4, firstShiftBit); psllq(xm5, firstShiftBit); pslldq(*L0, sb); pslldq(*H0, sb); orps(*L1, xm4); orps(*H1, xm5); psllq(*L0, firstShiftBit); psllq(*H0, firstShiftBit); for (int i = 0; i < loop; i++) { movzx(eax, byte [esi]); shl(eax, 2); movaps(xm4, ptr [edx + eax * 8]); // xTbl[i][0] movaps(xm5, ptr [edx + eax * 8 + 16]); add16D(*L0, *H0, xm4, xm5, xm6); int u = w / 8; for (int j = 1; j < sep * 3; j++) { movzx(eax, byte [esi + j * (w / 8)]); shl(eax, 2); ror256X(*L0, *L1, xm6, u); std::swap(L0, L1); ror256X(*H0, *H1, xm7, u); std::swap(H0, H1); movaps(xm4, ptr [edx + eax * 8]); // xTbl[i][1] movaps(xm5, ptr [edx + eax * 8 + 16]); add16D(*L0, *H0, xm4, xm5, xm6); } rol256(*L0, *L1, xm6, 12 - u); rol256(*H0, *H1, xm7, 12 - u); if (i < loop - 1) { shlBit(*L0, *L1, xm6, bit); shlBit(*H0, *H1, xm7, bit); add(esi, 16); } } reduce(*L0, *L1, *H0, *H1, xm4, xm5, xm6); add(esp, idxTblSize); mov(eax, ptr [esp + _P + 4]); // out movaps(ptr [eax + 0], *L0); movaps(ptr [eax + 16], *H0); pop(esi); ret(); } void createOptimizedMakePrevMulTbl(int bit) { mov(eax, ptr [esp + 4]); // xTbl mov(edx, ptr [esp + 8]); // x movaps(xm0, ptr [edx]); // xL movaps(xm1, ptr [edx + 16]); // xH xorps(xm2, xm2); setTbl(0, xm2, xm2); // 0 setTbl(1, xm0, xm1); // x setTbl(2, xm1, xm0); // -[H:L] = [L:H] if (bit > 1) { shlBit(xm0, xm1, xm2, 1); setTbl(3, xm0, xm1); movaps(xm5, ptr [eax + 3 * 32]); movaps(xm7, xm5); movaps(xm6, ptr [eax + 3 * 32 + 16]); setTbl(6, xm6, xm5); // -[3] orps(xm7, xm6); add0(4, 1, xm5, xm6, xm0, xm1, xm2, xm7); add0(5, 2, xm5, xm6, xm0, xm1, xm2, xm7); } for (int b = 2; b < bit; b++) { makeTbl(b); } ret(); } void createOptimizedDegree() { mov(ecx, ptr [esp + 4]); // this mov(edx, ptr [ecx + 12]); or(edx, ptr [ecx + 16 + 12]); jnz("exit96"); mov(edx, ptr [ecx + 8]); or(edx, ptr [ecx + 16 + 8]); jnz("exit64"); mov(edx, ptr [ecx + 4]); or(edx, ptr [ecx + 16 + 4]); jnz("exit32"); mov(edx, ptr [ecx]); or(edx, ptr [ecx + 16]); bsr(eax, edx); cmovz(eax, edx); ret(); L("exit32"); bsr(eax, edx); add(eax, 32); ret(); L("exit64"); bsr(eax, edx); add(eax, 64); ret(); L("exit96"); mov(eax, 96); ret(); } void rawShl() { mov(eax, ptr [esp + 4]); // this mov(edx, ptr [esp + 8]); // bit movaps(xm0, ptr [eax]); // L movaps(xm1, ptr [eax + 16]); // H cmp(edx, 64); jb("bit_lt_64"); pslldq(xm0, 8); pslldq(xm1, 8); sub(edx, 64); L("bit_lt_64"); movd(xm6, edx); // bit neg(edx); add(edx, 64); movd(xm7, edx); // 64 - bit xorps(xm2, xm2); xorps(xm3, xm3); movlhps(xm2, xm0); movlhps(xm3, xm1); psllq(xm0, xm6); psllq(xm1, xm6); psrlq(xm2, xm7); psrlq(xm3, xm7); orps(xm0, xm2); orps(xm1, xm3); movaps(ptr [eax], xm0); movaps(ptr [eax + 16], xm1); ret(); } void rawShl(int bit) { assert(0 < bit && bit < 64); mov(eax, ptr [esp + 4]); // this movaps(xm0, ptr [eax]); // L movaps(xm1, ptr [eax + 16]); // H xorps(xm2, xm2); xorps(xm3, xm3); movlhps(xm2, xm0); movlhps(xm3, xm1); psllq(xm0, bit); psllq(xm1, bit); psrlq(xm2, 64 - bit); psrlq(xm3, 64 - bit); orps(xm0, xm2); orps(xm1, xm3); movaps(ptr [eax], xm0); movaps(ptr [eax + 16], xm1); ret(); } void createOptimizedRawShl(int type) { switch (type) { case 0: rawShl(); break; case 1: case 2: rawShl(type); break; } } void createOptimizedCube() { push(esi); const int _P = 4 * 1; mov(esi, ptr [esp + _P + 4]); // x mov(ecx, (int)MIE_ETAT::cubeTbl); cubeCore(esi, 0, ecx, eax, edx); pop(esi); ret(); } void createOptimizedCubeSeq() { const int m = 97; const int lpN = (m + 1) / 2; push(esi); push(edi); const int _P = 4 * 2; mov(esi, ptr [esp + _P + 4]); // x mov(ecx, (int)MIE_ETAT::cubeTbl); mov(edi, lpN - 1); L("lp"); cubeCore(esi, 32, ecx, eax, edx); add(esi, 32); sub(edi, 1); jnz("lp"); pop(edi); pop(esi); ret(); } void createOptimizedCubeRoot() { push(ebx); push(esi); const int _P = 4 * 2; mov(esi, ptr [esp + _P + 4]); const int statckSize = 16 * 6 + 16; sub(esp, statckSize); mov(ebx, esp); and(ebx, uint32(~15)); cubeRootCore(esi, 0, ebx, eax, ecx, edx); add(esp, statckSize); pop(esi); pop(ebx); ret(); } void createOptimizedCubeRootSeq() { const int m = 97; const int lpN = (m + 1) / 2; push(ebx); push(esi); push(edi); const int _P = 4 * 3; mov(esi, ptr [esp + _P + 4]); mov(edi, lpN - 1); const int statckSize = 16 * 6 + 16; sub(esp, statckSize); mov(ebx, esp); and(ebx, uint32(~15)); L("lp"); cubeRootCore(esi, 32, ebx, eax, ecx, edx); add(esi, 32); sub(edi, 1); jnz("lp"); add(esp, statckSize); pop(edi); pop(esi); pop(ebx); ret(); } private: uint8 pack(uint32 a, uint32 b, uint32 c, uint32 d) { return static_cast((a << 6) | (b << 4) | (c << 2) | d); } /* swap 1 and 2 for each bit ex. 1102(3) -> 2201(3) */ uint32 getNegIdx(uint32 x) { uint32 ret = 0; uint32 i = 1; while (x > 0) { uint32 r = x % 3; if (r) { ret += (3 - r) * i; } i *= 3; x /= 3; } return ret; } /* input : eax(xTbl), edx(x) */ void makeTbl(int b) { const int n = intpow3(b); movaps(xm5, ptr [eax + (n / 3) * 32]); movaps(xm6, ptr [eax + (n / 3) * 32 + 16]); shlBit(xm5, xm6, xm7, 1); setTbl(n, xm5, xm6); setTbl(getNegIdx(n), xm6, xm5); movaps(xm7, xm5); orps(xm7, xm6); for (int i = 0; i < n - 1; i++) { add0(i + n + 1, i + 1, xm5, xm6, xm0, xm1, xm2, xm7); } } /* [L:H] = ror_n([H:L]) */ void ror256X(const Xmm& L, const Xmm& H, const Xmm& t, int nByte) { assert(nByte < 16); movaps(t, H); palignr(H, L, nByte); palignr(L, t, nByte); } void rol256(const Xmm& L, const Xmm& H, const Xmm& t, int nByte) { assert(nByte < 16); movaps(t, H); palignr(H, L, 16 - nByte); palignr(L, t, 16 - nByte); } /* input : [H:L], [bH:bL] destroy : t0, t1, t2, t3, tr0, tr1, [bH:bL] output : [H:L] for (int i = 0; i < 8; i++) { [H:L] += addr[i][getIdxBit4W32([bH:bL], i * 4)]; } */ void addLow(const Xmm& L, const Xmm& H, const Xmm& bL, const Xmm& bH, const Xmm& t0, const Xmm& t1, const Xmm& t2, const Reg32& addr, const Reg32& tr0, const Reg32& tr1) { static const uint8 idxTbl[16] = { 0, 1, 3, 4, 9, 10, 12, 13, 27, 28, 30, 31, 36, 37, 39, 40 }; for (int i = 0; i < 8; i++) { movd(tr0, bL); movd(tr1, bH); psrlq(bL, 4); psrlq(bH, 4); and(tr0, 0xf); and(tr1, 0xf); movzx(tr0, byte [tr0 + (int)idxTbl]); movzx(tr1, byte [tr1 + (int)idxTbl]); lea(tr0, ptr [tr0 + tr1 * 2]); // L * H * 2 shl(tr0, 5); movaps(t0, ptr [addr + tr0 + i * intpow3T::val * 32]); movaps(t1, ptr [addr + tr0 + i * intpow3T::val * 32 + 16]); add16D(L, H, t0, t1, t2); } } /* input : L, H destroy : t0, t1, t2, t3, tr output : outL, outH */ void compress(const Xmm& outL, const Xmm& outH, const Xmm& L, const Xmm& H, const Xmm& t0, const Xmm& t1, const Xmm& t2, const Xmm& t3, const Reg32& tr) { MIE_ALIGN(16) static const uint32 m[] = { 0x49249249, 0x49249249, 0x49249249, 0x49249249, // + 0 0x403003, 0x403003, 0x403003, 0x403003, // 16 0x180C00C, 0x180C00C, 0x180C00C, 0x180C00C, // 32 0xf, 0xf, 0xf, 0xf, // 48 0xf0, 0xf0, 0xf0, 0xf0, // 64 0xf00, 0xf00, 0xf00, 0xf00, // 80 }; mov(tr, (int)m); movaps(outL, L); movaps(outH, H); andps(outL, ptr[tr]); andps(outH, ptr[tr]); movaps(t0, outL); movaps(t1, outH); psrld(t0, 2); psrld(t1, 2); orps(outL, t0); // x |= x >> 2 orps(outH, t1); movaps(t0, outL); movaps(t1, outH); psrld(outL, 4); psrld(outH, 4); andps(t0, ptr [tr + 16]); andps(t1, ptr [tr + 16]); andps(outL, ptr [tr + 32]); andps(outH, ptr [tr + 32]); orps(outL, t0); orps(outH, t1); movaps(t0, outL); movaps(t1, outH); movaps(t2, outL); movaps(t3, outH); andps(outL, ptr [tr + 48]); andps(outH, ptr [tr + 48]); psrld(t0, 8); psrld(t1, 8); psrld(t2, 14); psrld(t3, 14); andps(t0, ptr [tr + 64]); andps(t1, ptr [tr + 64]); orps(outL, t0); orps(outH, t1); andps(t2, ptr [tr + 80]); andps(t3, ptr [tr + 80]); orps(outL, t2); orps(outH, t3); } static void printHex(const uint32 *p, const char *msg, int len) { if (msg) printf("%s ", msg); for (int i = 0; i < len; i++) { printf("%08x:", p[len - 1 - i]); } printf("\n"); } void debug(const Operand& x, const char *msg = 0) { push(eax); push(ecx); push(edx); MIE_ALIGN(16) static uint32 buf[4]; if (x.isREG(32)) { mov(ptr [buf], x); push(1); } else if (x.isXMM()) { movaps(ptr [buf], static_cast(x)); push(4); } else { assert(1); } push((int)(msg ? msg : x.toString())); // msg push((int)buf); // p call((void*)printHex); add(esp, 12); pop(edx); pop(ecx); pop(eax); } /* x = expand(reg) << shift input reg, tbl output : x destroy : reg */ void expand(const Xmm& x, const Reg32& reg, const Reg32& tbl, const Xmm& tx, const Reg32& tr, int shift) { const uint32 m10 = GetMask(10); const uint32 m11 = GetMask(11); switch (shift) { case 0: mov(tr, reg); and(tr, m11); movd(x, ptr [tbl + tr * 4]); mov(tr, reg); shr(tr, 11); and(tr, m11); movd(tx, ptr [tbl + tr * 4]); psllq(tx, 33); shr(reg, 22); orps(x, tx); movd(tx, ptr [tbl + reg * 4]); psllq(tx, 2); movlhps(x, tx); break; case 1: mov(tr, reg); and(tr, m11); movd(x, ptr [tbl + tr * 4]); mov(tr, reg); shr(tr, 11); and(tr, m10); paddd(x, x); movd(tx, ptr [tbl + tr * 4]); psllq(tx, 34); shr(reg, 21); orps(x, tx); movd(tx, ptr [tbl + reg * 4]); movlhps(x, tx); break; case 2: mov(tr, reg); and(tr, m10); movd(x, ptr [tbl + tr * 4]); psllq(x, 2); mov(tr, reg); shr(tr, 10); and(tr, m11); movd(tx, ptr [tbl + tr * 4]); psllq(tx, 32); orps(x, tx); shr(reg, 21); movd(tx, ptr [tbl + reg * 4]); paddd(tx, tx); movlhps(x, tx); break; } } /* x = t >> bit */ void shr128(const Xmm& x, const Xmm& t, int bit) { movaps(t, x); psrldq(t, 8); psrlq(x, bit); psllq(t, 64 - bit); orps(x, t); } /* x = t << bit */ void shl128(const Xmm& x, const Xmm& t, int bit) { movaps(t, x); pslldq(t, 8); psllq(x, bit); psrlq(t, 64 - bit); orps(x, t); } /* input : eax(xTbl) [eax + pos * 16] = L; [eax + pos * 16 + 16] = H; @note for only makePrevMulTbl */ void setTbl(int pos, const Xmm& L, const Xmm& H) { movaps(ptr [eax + pos * 32], L); movaps(ptr [eax + pos * 32 + 16], H); } /* input : eax(xTbl), [H:L], hint = L | H destroy : xm0, xm1, xm2 xTbl[d] = xTbl[a] + [H:L] xTbl[getNegIdx(d)] = -xTbl[d] @note for only makePrevMulTbl */ void add0(int d, int a, const Xmm& L, const Xmm& H, const Xmm& t0, const Xmm& t1, const Xmm& t2, const Xmm& hint) { movaps(t0, ptr [eax + a * 32]); movaps(t1, ptr [eax + a * 32 + 16]); add16hint(t0, t1, L, H, t2, hint); setTbl(d, t0, t1); // + setTbl(getNegIdx(d), t1, t0); // - } /* input : eax(xTbl) xTbl[d] = -xTbl[a]; @note for only makePrevMulTbl */ void neg0(int d, int a) { movaps(xm0, ptr [eax + a * 32]); movaps(xm1, ptr [eax + a * 32 + 16]); movaps(ptr [eax + d * 32], xm1); movaps(ptr [eax + d * 32 + 16], xm0); } /* input : [aH:aL], [bH:bL] destroy : t0, t1 output : [aH:aL] += [bH:bL] */ void add16(const Xmm& aL, const Xmm& aH, const Xmm& bL, const Xmm& bH, const Xmm& t0, const Xmm& t1) { movaps(t0, aL); movaps(t1, bL); orps(t0, aH); orps(t1, bH); andps(t0, t1); orps(aL, bL); orps(aH, bH); xorps(aL, t0); xorps(aH, t0); } /* input : [aH:aL], [bH:bL] destroy : t, bL output : [aH:aL] += [bH:bL] */ void add16D(const Xmm& aL, const Xmm& aH, const Xmm& bL, const Xmm& bH, const Xmm& t) { movaps(t, aL); orps(aL, bL); orps(t, aH); orps(bL, bH); orps(aH, bH); andps(t, bL); xorps(aL, t); xorps(aH, t); } /* input : [dH:dL], [sH:sL], h = bL | bH destroy : t0, t1 output : [dH:dL] += [sH:sL] */ void add16hint(const Xmm& aL, const Xmm& aH, const Xmm& bL, const Xmm& bH, const Xmm& t, const Xmm& hint) { movaps(t, aL); orps(t, aH); andps(t, hint); orps(aL, bL); orps(aH, bH); xorps(aL, t); xorps(aH, t); } /* input : [aH:aL], [bH:bL] destroy : t0, t1 output : [aH:aL] -= [bH:bL] */ void sub16(const Xmm& aL, const Xmm& aH, const Xmm& bL, const Xmm& bH, const Xmm& t0, const Xmm& t1) { add16(aL, aH, bH, bL, t0, t1); } /* input : [H:L] output : [H:L] <<= byte cf. palignr(xm1, xm0, 15); [xm1:xm0] = [AABBCCDD6677889922334455EEFF0011:DDEEFF0099AABBCC5566778811223344] [xm1:xm0] = [BBCCDD6677889922334455EEFF0011DD:DDEEFF0099AABBCC5566778811223344] palignr(A, B, 1); [AAAAAAAAAAAAAAAA] -> [ABBBBBBBBBBBBBBBB] */ void shlByte(const Xmm& L, const Xmm& H, int nByte) { assert(nByte < 16); palignr(H, L, 16 - nByte); pslldq(L, nByte); } /* input : [H:L] destroy : t [H:L] <<= bit */ void shlBit(const Xmm& L, const Xmm& H, const Xmm& t, int nBit) { assert(nBit < 8); movaps(t, H); palignr(t, L, 15); psllq(H, nBit); psrlq(t, 8 - nBit); orps(H, t); movaps(t, L); pslldq(t, 1); psllq(L, nBit); psrlq(t, 8 - nBit); orps(L, t); } /* input : [H:L] destroy : t [H:L] >>= bit */ void shrBit(const Xmm& L, const Xmm& H, const Xmm& t, int nBit) { const int r = nBit / 8; const int q = nBit & 7; movaps(t, L); palignr(L, H, r); palignr(t, H, r + 1); psrlq(L, q); psllq(t, 8 - q); orps(L, t); shrBit(H, t, nBit); } /* input : x destroy : t x >>= bit */ void shrBit(const Xmm& x, const Xmm& t, int nBit) { const int r = nBit / 8; const int q = nBit & 7; psrldq(x, r); movaps(t, x); psrlq(x, q); psrldq(t, 1); psllq(t, 8 - q); orps(x, t); } // [193..0] = [96..0] + [193..97] + rol_12([193..97]) + ([193..182] << 12) /* input : [H:L] output out = H = ([H:L] >> 97), L : not change palignr(xm1, xm0, 15); [xm1:xm0] = [AABBCCDD6677889922334455EEFF0011:DDEEFF0099AABBCC5566778811223344] [xm1:xm0] = [BBCCDD6677889922334455EEFF0011DD:DDEEFF0099AABBCC5566778811223344] */ void shr97(const Xmm& L, const Xmm& H, const Xmm& t) { movaps(t, H); // out = [H3:H2:H1:H0] L = [L3:L2:L1:L0] => [H2:H1:H0:L3] palignr(t, L, 12); palignr(H, L, 13); psrlq(t, 1); psllq(H, 8 - 1); orps(H, t); } /* input : x[96..0] output : x[84..12:96..85] */ void rol12(const Xmm& x, const Xmm& t) { movaps(t, x); pslldq(x, 1); // x <<= 8 pslldq(t, 2); // t <<= 16 psllq(x, 4); psrlq(t, 4); orps(x, t); // t <<= 12 xorps(t, t); movhlps(t, x); psrlq(t, 21 + 12); // t = x[..85] orps(x, t); } /* in : x = [L1:L0], [H1:H0] out : x mod (x^97 + x^12 - 1) */ void reduce(const Xmm& L0, const Xmm& L1, const Xmm& H0, const Xmm& H1, const Xmm& t0, const Xmm& t1, const Xmm& t2) { static const uint32 MIE_ALIGN(16) mask0[] = { 0xfff, 0, 0, 0 }; // QQQ static const uint32 MIE_ALIGN(16) mask1[] = { 0xffffffff, 0xffffffff, 0xffffffff, 1, 0 }; movaps(t2, ptr [mask0]); shr97(L0, L1, t0); // L1 = [L1:L0] >> 97 shr97(H0, H1, t0); // H1 = [H1:H0] >> 97 add16(L0, H0, L1, H1, t0, t1); rol12(L1, t0); rol12(H1, t0); sub16(L0, H0, L1, H1, t0, t1); andps(L1, t2); andps(H1, t2); movaps(t2, ptr [mask1]); psllq(L1, 12); psllq(H1, 12); add16D(L0, H0, L1, H1, t0); andps(L0, t2); andps(H0, t2); } /* input : x destroy : t1, t2 output : out, x(if pos == 0) bit = 2のとき out = x + ((x >> 1) & 1); bit = 3のとき in[pos + 2:pos + 1:pos]=[b2 b1 b0]を9b2 + 3b1 + b0に変換する */ void cvtBinToTri(const Xmm& out, const Xmm& x, const Xmm& t0, const Xmm& t1, const Xmm& mask0, const Xmm& mask1, int bit, int pos) { if (bit == 1) { movaps(out, x); if (pos > 0) { psrlq(out, pos); } andps(out, mask0); return; } const Xmm *t = &x; if (pos > 0) { movaps(t0, x); psrlq(t0, pos); t = &t0; } switch (bit) { case 2: andps(*t, mask1); movaps(out, *t); psrlq(*t, 1); andps(*t, mask0); paddd(out, *t); break; case 3: /* x = 4a + 2b + c y = 9a + 3b + c = 8a + a + (2b + c) + b */ movaps(out, *t); psrlq(out, 2); andps(out, mask0); // a movaps(t1, out); psllq(t1, 3); // 8a paddd(out, t1); // 9a movaps(t1, *t); andps(t1, mask1); // 2b + c paddd(out, t1); // 9a + 2b + c psrlq(t1, 1); andps(t1, mask0); // b paddd(out, t1); break; case 4: /* x = 8a + 4b + 2c + d y = 27a + 9b + 3c + d = (3a + b) * 8 + (3a + b) + (2c + d) + c */ movaps(out, *t); psrlq(out, 2); andps(out, mask1); // 2a + b movaps(t1, out); psrlq(t1, 1); andps(t1, mask0); // a paddd(out, t1); // 3a + b movaps(t1, out); psllq(t1, 3); // (3a + b) * 8 paddd(out, t1); // 27a + 9b movaps(t1, *t); andps(t1, mask1); // 2c + d paddd(out, t1); // 27a + 9b + 2c + d psrlq(t1, 1); andps(t1, mask0); // c paddd(out, t1); // 27a + 9b + 3c + d break; default: fprintf(stderr, "bad bit=%d\n", bit); assert(0); } #if 0 xorps(out, out); for (int j = 0; j < bit; j++) { movaps(t0, x); int s = pos + bit - 1 - j; if (s) { psrlq(t0, s); } andps(t0, mask0); paddd(out, t0); if (j < bit - 1) { movaps(t0, out); paddd(out, t0); paddd(out, t0); } } #endif } /* setIdx(&idxTbl[0][0], b, topBit); for (int s = 1, i = topBit - bit; i >= 0; i -= bit, s++) { setIdx(&idxTbl[s][0], b, i); } */ void makeIdxTbl(int bit, int w, int firstShiftBit, int topBit, const Reg32& x, const Reg32& idxTbl) { const int n = w == 8 ? 0 : w == 16 ? 1 : 2; MIE_ALIGN(16) static const uint32 maskTbl[3][2][4] = { { // w = 8 { 0x01010101, 0x01010101, 0x01010101, 0x01010101 }, { 0x03030303, 0x03030303, 0x03030303, 0x03030303 }, }, { // w = 16 { 0x00010001, 0x00010001, 0x00010001, 0x00010001 }, { 0x00030003, 0x00030003, 0x00030003, 0x00030003 }, }, { // w = 32 { 0x00000001, 0x00000001, 0x00000001, 0x00000001 }, { 0x00000003, 0x00000003, 0x00000003, 0x00000003 }, } }; movaps(xm6, ptr [maskTbl[n][0]]); movaps(xm7, ptr [maskTbl[n][1]]); movaps(xm0, ptr [x]); // L movaps(xm1, ptr [x + 16]); // H for (int s = 0, i = topBit; i >= 0; i -= bit, s++) { int b = (s == 0) ? firstShiftBit : bit; cvtBinToTri(xm2, xm0, xm4, xm5, xm6, xm7, b, i); cvtBinToTri(xm3, xm1, xm4, xm5, xm6, xm7, b, i); paddd(xm2, xm3); paddd(xm2, xm3); // L + H * 2 movaps(ptr [idxTbl + s * 16], xm2); } } /* use all xmm0, ..., xm7, stack[16 * 6], t1, t2 (in + outOffset)[H:L] = cubeRoot(in[H:L]) */ void cubeRootCore(const Reg32& in, const int outOffset, const Reg32& stack, const Reg32& t1, const Reg32& t2, const Reg32& t3) { movaps(xm0, ptr [in]); // L movaps(xm1, ptr [in + 16]); // H compress(xm2, xm3, xm0, xm1, xm4, xm5, xm6, xm7, t2); movaps(ptr [stack], xm2); movaps(ptr [stack + 16], xm3); psrld(xm0, 1); psrld(xm1, 1); compress(xm2, xm3, xm0, xm1, xm4, xm5, xm6, xm7, t2); movaps(ptr [stack + 32], xm2); movaps(ptr [stack + 48], xm3); psrld(xm0, 1); psrld(xm1, 1); compress(xm2, xm3, xm0, xm1, xm4, xm5, xm6, xm7, t2); movaps(ptr [stack + 64], xm2); movaps(ptr [stack + 80], xm3); /* +0 +1 +2 +3 +4 +5 +6 +7 L0 >> 0, L1 >> 0, L2 >> 0, x, H0 >> 0, H1 >> 0, H2 >> 0, x, L0 >> 1, L1 >> 1, L2 >> 1, x, H0 >> 1, H1 >> 1, H2 >> 1, x, L0 >> 2, L1 >> 2, L2 >> 2, x, H0 >> 2, H1 >> 2, H2 >> 2, x, */ mov(t2, ptr [in + 12]); and(t2, 1); // x.L_[3] & 1 movd(xm0, t2); mov(t2, ptr [in + 16 + 12]); and(t2, 1); movd(xm1, t2); // x.H_[3] & 1 pslldq(xm0, 4); pslldq(xm1, 4); movss(xm2, ptr [stack + 0 * 4]); // L0 movss(xm3, ptr [stack + 4 * 4]); orps(xm0, xm2); orps(xm1, xm3); movss(xm2, ptr [stack + 9 * 4]); // L1 >> 1 movss(xm3, ptr [stack + 13 * 4]); psllq(xm2, 11); psllq(xm3, 11); orps(xm0, xm2); orps(xm1, xm3); movss(xm2, ptr [stack + 18 * 4]); // L2 >> 2 movss(xm3, ptr [stack + 22 * 4]); psllq(xm2, 22); psllq(xm3, 22); orps(xm0, xm2); // a orps(xm1, xm3); movss(xm2, ptr [stack + 8 * 4]); // L0 >> 1 movss(xm3, ptr [stack + 12 * 4]); movss(xm4, ptr [stack + 17 * 4]); // L1 >> 2 movss(xm5, ptr [stack + 21 * 4]); psllq(xm4, 11); psllq(xm5, 11); orps(xm2, xm4); orps(xm3, xm5); movss(xm4, ptr [stack + 2 * 4]); // L2 movss(xm5, ptr [stack + 6 * 4]); psllq(xm4, 21); psllq(xm5, 21); orps(xm2, xm4); // b orps(xm3, xm5); movss(xm4, ptr [stack + 16 * 4]); // L0 >> 2 movss(xm5, ptr [stack + 20 * 4]); movss(xm6, ptr [stack + 1 * 4]); // L1 movss(xm7, ptr [stack + 5 * 4]); psllq(xm6, 10); psllq(xm7, 10); orps(xm4, xm6); orps(xm5, xm7); movss(xm6, ptr [stack + 10 * 4]); // L2 > 1 movss(xm7, ptr [stack + 14 * 4]); psllq(xm6, 21); psllq(xm7, 21); orps(xm4, xm6); orps(xm5, xm7); movaps(ptr [stack], xm4); movaps(ptr [stack + 16], xm5); mov(t1, (int)_10cubeRootTbl); addLow(xm0, xm1, xm2, xm3, xm4, xm5, xm6, t1, t2, t3); movaps(xm2, ptr [stack]); movaps(xm3, ptr [stack + 16]); mov(t1, (int)_100cubeRootTbl); addLow(xm0, xm1, xm2, xm3, xm4, xm5, xm6, t1, t2, t3); movaps(ptr [in + outOffset], xm0); movaps(ptr [in + outOffset + 16], xm1); } /* use all xmm0, ..., xm7 input : in ; address of [H:L], tbl = MIE_ETAT::cubeTbl output : (in + offset)[H:L] = cube(in[H:L]) 96 : [95..64] : [63..32] : [31..0] --- 32 : [96..65] : [64..33] : [31..0] ; a -- : [92..61] : [60..33] << 4 : [96..89] << 4 | [92..89] ; b -- : [88..61] << 4 | [64..61] : [60..33] << 4 : [96..89] << 4 | [96..93] ; c */ void cubeCore(const Reg32& in, int offset, const Reg32& tbl, const Reg32& t0, const Reg32& t1) { movaps(xm0, ptr [in]); // L movaps(xm1, ptr [in + 16]); // H MIE_ALIGN(16) static const uint32 m[][4] = { { 0x00000000, 0x00000000, 0x00000ff0, 0x00000000 }, // 0 { 0xfffffff0, 0x0000000f, 0x00000000, 0x00000000 }, // 2 { 0x00000000, 0x00000000, 0x00000000, 0x00000001 }, // 3 }; movaps(xm6, xm0); movaps(xm7, xm1); shr128(xm0, xm4, 33); // [-:-:96..65:64..33] shr128(xm1, xm4, 33); movlhps(xm0, xm6); // [*:31..0:96..65:64..33] movlhps(xm1, xm7); xorps(xm4, xm4); xorps(xm5, xm5); movss(xm4, xm0); // [-:-:-:64..33] movss(xm5, xm1); psllq(xm4, 4); // [-:-:_64..61:60..33____] psllq(xm5, 4); psrlq(xm6, 21); // [-:_96..89????:*:*] psrlq(xm7, 21); andps(xm6, ptr [m[0]]); // -:_96..89____:-:-] andps(xm7, ptr [m[0]]); orps(xm4, xm6); // [-:_96..89____:__64..61:60..33____] orps(xm5, xm7); movaps(xm6, xm0); movaps(xm7, xm1); psrlq(xm6, 28); // [*:*:*96..93:92..65????] psrlq(xm7, 28); andps(xm6, ptr [m[1]]); // [-:-:__96..93:92..65____] andps(xm7, ptr [m[1]]); pslldq(xm6, 4); pslldq(xm7, 4); orps(xm6, xm5); // [96..65] - ([96..89] + ([64..33] << 4)) orps(xm7, xm4); movaps(xm2, xm0); movaps(xm3, xm1); add16D(xm0, xm1, xm6, xm7, xm4); psrlq(xm2, 24); // [*:*:92..89:88..61????] psrlq(xm3, 24); andps(xm2, ptr [m[1]]); andps(xm3, ptr [m[1]]); pslldq(xm2, 4); // [-:_92..89:88..61____:-] pslldq(xm3, 4); add16D(xm0, xm1, xm2, xm3, xm4); movd(t0, xm0); expand(xm2, t0, tbl, xm6, t1, 2); movd(t0, xm1); expand(xm3, t0, tbl, xm6, t1, 2); psrldq(xm0, 4); psrldq(xm1, 4); movd(t0, xm0); expand(xm4, t0, tbl, xm6, t1, 1); movd(t0, xm1); expand(xm5, t0, tbl, xm6, t1, 1); orps(xm2, xm4); orps(xm3, xm5); psrldq(xm0, 4); psrldq(xm1, 4); movd(t0, xm0); expand(xm0, t0, tbl, xm6, t1, 0); movd(t0, xm1); expand(xm1, t0, tbl, xm6, t1, 0); orps(xm0, xm2); orps(xm1, xm3); movaps(xm2, ptr [in]); movaps(xm3, ptr [in + 16]); pslldq(xm2, 8); pslldq(xm3, 8); andps(xm2, ptr [m[2]]); andps(xm3, ptr [m[2]]); orps(xm0, xm2); orps(xm1, xm3); movaps(ptr [in + offset], xm0); movaps(ptr [in + offset + 16], xm1); } }; class OptimizedMulLatter { enum { BIT_KIND_NUM = 3, /* 2, 3, 4 */ W_KIND_NUM = 3 /* 8, 16, 32 */ }; GeneratorForF3_97 codeTbl[BIT_KIND_NUM][W_KIND_NUM]; F3_97::mulLatterFunc funcTbl[BIT_KIND_NUM][W_KIND_NUM]; public: OptimizedMulLatter() { static const int bitTbl[] = { 2, 3, 4 }; static const int wTbl[] = { 8, 16, 32 }; for (size_t i = 0; i < NUM_OF_ARRAY(bitTbl); i++) { int bit = bitTbl[i]; for (size_t j = 0; j < NUM_OF_ARRAY(wTbl); j++) { int w = wTbl[j]; codeTbl[i][j].createOptimizedMulLatter(bit, w); funcTbl[i][j] = (F3_97::mulLatterFunc)(codeTbl[i][j].getCode()); } } } template inline void (*get() const)(F3_97& out, const F3_97 *xTbl, const F3_97& b) { return funcTbl[bit - 2][w == 32 ? 2 : w == 16 ? 1 : 0]; } }; class OptimizedMakePrevMulTbl { enum { BIT_KIND_NUM = 3 /* 2, 3, 4 */ }; GeneratorForF3_97 codeTbl[BIT_KIND_NUM]; F3_97::makePrevMulTblFunc funcTbl[BIT_KIND_NUM]; public: OptimizedMakePrevMulTbl() { static const int bitTbl[] = { 2, 3, 4 }; for (size_t i = 0; i < NUM_OF_ARRAY(bitTbl); i++) { int bit = bitTbl[i]; codeTbl[i].createOptimizedMakePrevMulTbl(bit); funcTbl[i] = (F3_97::makePrevMulTblFunc)(codeTbl[i].getCode()); } } template inline void (*get() const)(F3_97 *xTbl, const F3_97& x) { return funcTbl[bit - 2]; } }; class OptimizedCube { GeneratorForF3_97 code; F3_97::cubeFunc func; public: OptimizedCube() { code.createOptimizedCube(); func = (F3_97::cubeFunc)code.getCode(); } inline void (*get() const)(F3_97& x) { return func; } }; class OptimizedRawShl { enum { TYPE_NUM = 3 /* 0:any, 1:fix, 2:fix */ }; GeneratorForF3_97 codeTbl[TYPE_NUM]; F3_97::rawShlFunc func[TYPE_NUM]; public: OptimizedRawShl() { for (int type = 0; type < TYPE_NUM; type++) { codeTbl[type].createOptimizedRawShl(type); func[type] = (F3_97::rawShlFunc)codeTbl[type].getCode(); } } inline void (*get() const)(F3_97& x, int bit) { return func[0]; } #if 0 inline void (*get1() const)(uint32 *) { return (uint32 (*)(uint32*))func[1]; } inline void (*get2() const)(uint32 *) { return (uint32 (*)(uint32*))func[2]; } #endif }; class OptimizedDegree { GeneratorForF3_97 codeTbl; F3_97::degreeFunc func; public: OptimizedDegree() { codeTbl.createOptimizedDegree(); func = (F3_97::degreeFunc)codeTbl.getCode(); } inline int (*get() const)(const F3_97&) { return func; } }; class OptimizedCubeRoot { GeneratorForF3_97 codeTbl; F3_97::cubeRootFunc func; public: OptimizedCubeRoot() { codeTbl.createOptimizedCubeRoot(); func = (F3_97::cubeRootFunc)codeTbl.getCode(); } inline void (*get() const)(F3_97&) { return func; } }; class OptimizedCubeRootSeq { GeneratorForF3_97 codeTbl; F3_97::cubeRootSeqFunc func; public: OptimizedCubeRootSeq() { codeTbl.createOptimizedCubeRootSeq(); func = (F3_97::cubeRootSeqFunc)codeTbl.getCode(); } inline void (*get() const)(F3_97*) { return func; } }; class OptimizedCubeSeq { GeneratorForF3_97 codeTbl; F3_97::cubeSeqFunc func; public: OptimizedCubeSeq() { codeTbl.createOptimizedCubeSeq(); func = (F3_97::cubeSeqFunc)codeTbl.getCode(); } inline void (*get() const)(F3_97*) { return func; } }; class Cpu : public Xbyak::CodeGenerator { public: enum { NONE = 0, MMX = 1 << 0, EMMX = 1 << 1, SSE = 1 << 2, SSE2 = 1 << 3, SSE3 = 1 << 4, SSSE3 = 1 << 5 }; uint32 type_; Cpu() : type_(NONE) { create(); uint32 data[4]; get()(0, data); static const char intel[] = "ineI"; if (data[3] == *(const uint32*)intel) { get()(1, data); uint32 r = data[2]; if (r & (1 << 9)) { type_ |= SSSE3; } } } bool has(uint32 type) { return (type & type_) != 0; } private: void create() { push(ebx); push(esi); const int _P = 4 * 2; mov(eax, ptr [esp + _P + 4]); Xbyak::CodeGenerator::cpuid(); mov(esi, ptr [esp + _P + 8]); mov(ptr [esi], eax); mov(ptr [esi + 4], ebx); mov(ptr [esi + 8], ecx); mov(ptr [esi + 12], edx); pop(esi); pop(ebx); ret(); } void (*get() const)(uint32 eaxIn, uint32 data[4]) { return (void (*)(uint32 eaxIn, uint32 data[4]))getCode(); } }; #endif