diff --git a/swgl/src/gl.cc b/swgl/src/gl.cc index d8e346c209..f2679e7ac2 100644 --- a/swgl/src/gl.cc +++ b/swgl/src/gl.cc @@ -901,7 +901,7 @@ void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - I32 c = roundto((Float){b, g, r, a}, 255.49f); + I32 c = roundfast((Float){b, g, r, a}, 255.49f); ctx->blendcolor = CONVERT(c, U16).xyzwxyzw; } @@ -931,7 +931,7 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { } void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - I32 c = roundto((Float){b, g, r, a}, 255.49f); + I32 c = roundfast((Float){b, g, r, a}, 255.49f); ctx->clearcolor = bit_cast(CONVERT(c, U8)); } @@ -2257,7 +2257,7 @@ static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf, } static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { - ivec4 i = roundto(v, 255.49f); + ivec4 i = roundfast(v, 255.49f); HalfRGBA8 xz = packRGBA8(i.z, i.x); HalfRGBA8 yw = packRGBA8(i.y, i.w); HalfRGBA8 xy = zipLow(xz, yw); @@ -2268,7 +2268,7 @@ static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { } static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) { - I32 i = roundto((Float){v.z, v.y, v.x, v.w}, 255.49f); + I32 i = roundfast((Float){v.z, v.y, v.x, v.w}, 255.49f); HalfRGBA8 c = packRGBA8(i, i); return combine(c, c); } @@ -2429,7 +2429,7 @@ static inline void commit_output(uint32_t* buf, int span) { } static inline WideR8 pack_pixels_R8(Float c) { - return packR8(roundto(c, 255.49f)); + return packR8(roundfast(c, 255.49f)); } static inline WideR8 pack_pixels_R8() { diff --git a/swgl/src/glsl.h b/swgl/src/glsl.h index 52528adb0b..d5e492f6ef 100644 --- a/swgl/src/glsl.h +++ b/swgl/src/glsl.h @@ -66,7 +66,7 @@ SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; } SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; } SI Float if_then_else(I32 c, float t, float e) { - return Float((c & I32(Float(t))) | (~c & I32(Float(e)))); + return bit_cast((c & bit_cast(Float(t))) | (~c & bit_cast(Float(e)))); } SI I32 if_then_else(I32 c, int32_t t, int32_t e) { @@ -74,7 +74,7 @@ SI I32 if_then_else(I32 c, int32_t t, int32_t e) { } SI Float if_then_else(I32 c, Float t, Float e) { - return Float((c & I32(t)) | (~c & I32(e))); + return bit_cast((c & bit_cast(t)) | (~c & bit_cast(e))); } SI Float if_then_else(int32_t c, Float t, Float e) { return c ? t : e; } @@ -489,7 +489,7 @@ Float abs(Float v) { #if USE_NEON return vabsq_f32(v); #else - return Float(I32(v) & I32(0.0f - v)); + return bit_cast(bit_cast(v) & bit_cast(0.0f - v)); #endif } @@ -521,9 +521,32 @@ Float ceil(Float v) { return roundtrip + if_then_else(roundtrip < v, Float(1), Float(0)); } -SI int32_t roundto(float v, float scale) { return int32_t(v * scale + 0.5f); } +// Round to nearest even +SI int32_t roundeven(float v, float scale) { +#if USE_SSE2 + return _mm_cvtss_si32(_mm_set_ss(v * scale)); +#else + return bit_cast(v * scale + float(0xC00000)) - 0x4B400000; +#endif +} + +SI I32 roundeven(Float v, Float scale) { +#if USE_SSE2 + return _mm_cvtps_epi32(v * scale); +#else + // Magic number implementation of round-to-nearest-even + // see http://stereopsis.com/sree/fpu2006.html + return bit_cast(v * scale + Float(0xC00000)) - 0x4B400000; +#endif +} + +// Round towards zero +SI int32_t roundzero(float v, float scale) { return int32_t(v * scale); } + +SI I32 roundzero(Float v, Float scale) { return cast(v * scale); } -SI I32 roundto(Float v, Float scale) { +// Round whichever direction is fastest for positive numbers +SI I32 roundfast(Float v, Float scale) { #if USE_SSE2 return _mm_cvtps_epi32(v * scale); #else @@ -566,8 +589,8 @@ Float approx_log2(Float x) { Float approx_pow2(Float x) { Float f = fract(x); return bit_cast( - roundto(1.0f * (1 << 23), x + 121.274057500f - 1.490129070f * f + - 27.728023300f / (4.84252568f - f))); + roundfast(1.0f * (1 << 23), x + 121.274057500f - 1.490129070f * f + + 27.728023300f / (4.84252568f - f))); } // From skia @@ -1629,9 +1652,9 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) { return vec4(x, y, z, w); } -SI ivec4 roundto(vec4 v, Float scale) { - return ivec4(roundto(v.x, scale), roundto(v.y, scale), roundto(v.z, scale), - roundto(v.w, scale)); +SI ivec4 roundfast(vec4 v, Float scale) { + return ivec4(roundfast(v.x, scale), roundfast(v.y, scale), + roundfast(v.z, scale), roundfast(v.w, scale)); } vec4 operator*(vec4_scalar a, Float b) { @@ -2652,7 +2675,7 @@ vec4 texture(sampler2D sampler, vec2 P) { return textureLinearR8(sampler, P); } } else { - ivec2 coord(roundto(P.x, sampler->width), roundto(P.y, sampler->height)); + ivec2 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height)); return texelFetch(sampler, coord, 0); } } @@ -2663,7 +2686,7 @@ vec4 texture(sampler2DRect sampler, vec2 P) { return textureLinearRGBA8(sampler, P * vec2_scalar{1.0f / sampler->width, 1.0f / sampler->height}); } else { - ivec2 coord(roundto(P.x, 1.0f), roundto(P.y, 1.0f)); + ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); return texelFetch(sampler, coord); } } @@ -2676,7 +2699,7 @@ vec4 texture(sampler2DArray sampler, vec3 P, Float layer) { vec4 texture(sampler2DArray sampler, vec3 P) { if (sampler->filter == TextureFilter::LINEAR) { I32 zoffset = - clampCoord(roundto(P.z, 1.0f), sampler->depth) * sampler->height_stride; + clampCoord(roundeven(P.z, 1.0f), sampler->depth) * sampler->height_stride; if (sampler->format == TextureFormat::RGBA32F) { return textureLinearRGBA32F(sampler, vec2(P.x, P.y), zoffset); } else if (sampler->format == TextureFormat::RGBA8) { @@ -2687,8 +2710,8 @@ vec4 texture(sampler2DArray sampler, vec3 P) { } } else { // just do nearest for now - ivec3 coord(roundto(P.x, sampler->width), roundto(P.y, sampler->height), - roundto(P.z, 1.0f)); + ivec3 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height), + roundeven(P.z, 1.0f)); return texelFetch(sampler, coord, 0); } }