#include #include #include #include #if defined(_WIN32) && (_MSC_VER <= 1400) #define NO_SSE #endif #ifdef NO_SSE #define MIE_ALIGN(x) float LogC(float x) { return log(x); } void Log(float out[4], const float in[4]) { out[0] = log(in[0]); out[1] = log(in[1]); out[2] = log(in[2]); out[3] = log(in[3]); } #else #ifdef _WIN32 #include #define MIE_ALIGN(x) __declspec(align(x)) #else #include #define MIE_ALIGN(x) __attribute__((aligned(x))) #endif float LogC(float x) { const float SQRT2 = sqrt(2.0F); const float LOG_E_2 = log(2.0F); float a, z, zz; unsigned int *p; int b; const float A = 1.0 + 3.104605e-7; /* A*2 = 2.0000006209 */ const float B = 1.0/3 - 9.440747e-5; /* B*2 = 0.6664778517 */ const float C = 1.0/5 + 6.987293e-3; /* C*2 = 0.4139745860 */ p = (unsigned int *)&x; b = (*p >> 23) - 127; /* suppose x_org > 0 */ *p= ( *p & 0x007FFFFF ) | 0x3F800000; a = *(float *)p; z = (a - SQRT2) / (a + SQRT2); zz = z * z; z = (b+0.5) * LOG_E_2 + z * ((A*2) + zz * ((B*2) + zz * (C*2))); return z; } void Log(float out[4], const float in[4]) { static const MIE_ALIGN(16) float fA2[4] = { 2.0000006209f, 2.0000006209f, 2.0000006209f, 2.0000006209f }; static const MIE_ALIGN(16) float fB2[4] = { 0.6664778517f, 0.6664778517f, 0.6664778517f, 0.6664778517f }; static const MIE_ALIGN(16) float fC2[4] = { 0.4139745860f, 0.4139745860f, 0.4139745860f, 0.4139745860f }; static const MIE_ALIGN(16) float fsqrt2[4] = { 1.4142135623f, 1.4142135623f, 1.4142135623f, 1.4142135623f }; static const MIE_ALIGN(16) int i007FFFFF[4] = { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF }; static const MIE_ALIGN(16) int i3F800000[4] = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 }; static const MIE_ALIGN(16) int i127[4] = { 127, 127, 127, 127 }; static const MIE_ALIGN(16) float f0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; // static const MIE_ALIGN(16) float fm0p5[4] = { -0.5f, -0.5f, -0.5f, -0.5f }; // static const MIE_ALIGN(16) float f1p5[4] = { 1.5f, 1.5f, 1.5f, 1.5f }; static const MIE_ALIGN(16) float flog2[4] = { 0.6931471805f, 0.6931471805f, 0.6931471805f, 0.6931471805f }; static const MIE_ALIGN(16) float f2[4] = { 2, 2, 2, 2 }; __m128i x = *(const __m128i*)in; __m128i y = _mm_and_si128(x, *(const __m128i*)i007FFFFF); y = _mm_or_si128(y, *(const __m128i*)i3F800000); x = _mm_srli_epi32(x, 23); x = _mm_sub_epi32(x, *(const __m128i*)i127); __m128 a = _mm_castsi128_ps(y); __m128 b = _mm_cvtepi32_ps(x); __m128 sqrt2 = *(const __m128*)fsqrt2; #if 0 __m128 r0 = _mm_add_ps(a, sqrt2); __m128 r1 = _mm_rcp_ps(r0); r0 = _mm_mul_ps(_mm_mul_ps(r0, r1), r1); r1 = _mm_add_ps(r1, r1); r0 = _mm_sub_ps(r1, r0); __m128 z = _mm_mul_ps(_mm_sub_ps(a, sqrt2), r0); #else __m128 z = _mm_div_ps(_mm_sub_ps(a, sqrt2), _mm_add_ps(a, sqrt2)); #endif __m128 zz = _mm_mul_ps(z, z); __m128 ret = _mm_mul_ps(zz, *(const __m128*)fC2); ret = _mm_add_ps(ret, *(const __m128*)fB2); ret = _mm_mul_ps(ret, zz); ret = _mm_add_ps(ret, *(const __m128*)fA2); ret = _mm_mul_ps(ret, z); b = _mm_add_ps(b, *(const __m128*)f0p5); b = _mm_mul_ps(b, *(const __m128*)flog2); ret = _mm_add_ps(ret, b); *(__m128*)out = ret; } #endif int main() { #if 1 const float e = 1e-6f; double sum = 0; double max = 0; int count = 0; for (float x = 1; x <= 2; x += e) { float y = log(x); MIE_ALIGN(16) float in[4] = { x, x, x, x }; MIE_ALIGN(16) float out[4]; Log(out, in); double diff = fabs(y - out[0]); max = std::max(max, diff); sum += diff; count++; } printf("max=%e, ave=%e\n", max, sum / count); { sum = 0; boost::timer t; for (float x = e; x <= 10; x += e * 4) { MIE_ALIGN(16) float in[4] = { x, x + e, x + e * 2, x + e * 3 }; MIE_ALIGN(16) float out[4]; out[0] = log(in[0]); out[1] = log(in[1]); out[2] = log(in[2]); out[3] = log(in[3]); sum += (out[0] + out[1] + out[2] + out[3]) * e; } printf("sum=%f, time=%f\n", sum, t.elapsed()); } { sum = 0; boost::timer t; for (float x = e; x <= 10; x += e * 4) { MIE_ALIGN(16) float in[4] = { x, x + e, x + e * 2, x + e * 3 }; MIE_ALIGN(16) float out[4]; Log(out, in); sum += (out[0] + out[1] + out[2] + out[3]) * e; } printf("sum=%f, time=%f\n", sum, t.elapsed()); } #else for (float x = 0.1; x < 5; x += 0.3) { float y = log(x); MIE_ALIGN(16) float in[4] = { x, x, x, x }; MIE_ALIGN(16) float out[4]; Log(out, in); printf("x=%f, y=%f, z=%f, my=%f\n", x, y, LogC(x), out[0]); } #endif }