From b28908860d2001f1c66627e0ec024a01e5e9af7c Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 16 Jul 2013 12:52:18 +0200 Subject: [PATCH] Optimize FT_MulFix for x86_64 GCC builds. This patch provides an optimized `FT_MulFix' implementation for x86_64 machines when FreeType is built with GCC, or compatible compilers like Clang. Example: bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf Before: Load 4.863 us/op Load_Advances (Normal) 4.816 us/op Load_Advances (Fast) 0.028 us/op Render 2.753 us/op Get_Glyph 0.463 us/op Get_CBox 0.077 us/op Get_Char_Index 0.023 us/op Iterate CMap 13.898 us/op New_Face 12.368 us/op Embolden 0.028 us/op Get_BBox 0.302 us/op After: Load 4.617 us/op Load_Advances (Normal) 4.645 us/op Load_Advances (Fast) 0.027 us/op Render 2.789 us/op Get_Glyph 0.460 us/op Get_CBox 0.077 us/op Get_Char_Index 0.024 us/op Iterate CMap 13.403 us/op New_Face 12.278 us/op Embolden 0.028 us/op Get_BBox 0.301 us/op * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h (FT_MulFix_x86_64): New function. --- ChangeLog | 42 +++++++++++++++++++++ builds/unix/ftconfig.in | 59 +++++++++++++++++++++++++++++ include/freetype/config/ftconfig.h | 60 ++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+) diff --git a/ChangeLog b/ChangeLog index 41c0d0dea..738a1ad3c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,45 @@ +2013-07-16 David Turner + + Optimize FT_MulFix for x86_64 GCC builds. + + This patch provides an optimized `FT_MulFix' implementation for + x86_64 machines when FreeType is built with GCC, or compatible + compilers like Clang. 
+ + Example: + bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf + + Before: + + Load 4.863 us/op + Load_Advances (Normal) 4.816 us/op + Load_Advances (Fast) 0.028 us/op + Render 2.753 us/op + Get_Glyph 0.463 us/op + Get_CBox 0.077 us/op + Get_Char_Index 0.023 us/op + Iterate CMap 13.898 us/op + New_Face 12.368 us/op + Embolden 0.028 us/op + Get_BBox 0.302 us/op + + After: + + Load 4.617 us/op + Load_Advances (Normal) 4.645 us/op + Load_Advances (Fast) 0.027 us/op + Render 2.789 us/op + Get_Glyph 0.460 us/op + Get_CBox 0.077 us/op + Get_Char_Index 0.024 us/op + Iterate CMap 13.403 us/op + New_Face 12.278 us/op + Embolden 0.028 us/op + Get_BBox 0.301 us/op + + * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h + (FT_MulFix_x86_64): New function. + 2013-07-16 David Turner Speed up ARMv7 support. diff --git a/builds/unix/ftconfig.in b/builds/unix/ftconfig.in index c82fe5d6e..c373b9f5f 100644 --- a/builds/unix/ftconfig.in +++ b/builds/unix/ftconfig.in @@ -366,6 +366,7 @@ FT_BEGIN_HEADER /* These must be defined `static __inline__' with GCC. */ #if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */ + #define FT_MULFIX_ASSEMBLER FT_MulFix_arm /* documentation is in freetype.h */ @@ -428,7 +429,9 @@ FT_BEGIN_HEADER /* ( __thumb2__ || !__thumb__ ) && */ /* !( __CC_ARM || __ARMCC__ ) */ + #if defined( __i386__ ) + #define FT_MULFIX_ASSEMBLER FT_MulFix_i386 /* documentation is in freetype.h */ @@ -497,6 +500,62 @@ FT_BEGIN_HEADER #endif /* _MSC_VER */ + +#if defined( __GNUC__ ) && defined( __x86_64__ ) + +#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64 + + static __inline__ FT_Int32 + FT_MulFix_x86_64( FT_Int32 a, + FT_Int32 b ) + { + /* Temporarily disable the warning that C90 doesn't support */ + /* `long long'. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wlong-long" + +#if 1 + /* Technically not an assembly fragment, but GCC does a really good */ + /* job at inlining it and generating good machine code for it. 
*/ + long long ret, tmp; + + + ret = (long long)a * b; + tmp = ret >> 63; + ret += 0x8000 + tmp; + + return (FT_Int32)( ret >> 16 ); +#else + + /* This hand-written variant is compiled out by the `#if 1' above. */ + /* `imul' both reads and writes `wide_a', so it must be a read-write */ + /* operand (`+r'). With a write-only constraint (`=&r') GCC may drop */ + /* the sign extension of `a' and treat the register that holds `a' on */ + /* input as a 64-bit value, which is wrong most of the time; this was */ + /* observed with GCC 4.6 on Ubuntu 12.04. */ + long long wide_a = (long long)a; + long long wide_b = (long long)b; + long long result; + + + __asm__ __volatile__ ( + "imul %2, %1\n" + "mov %1, %0\n" + "sar $63, %0\n" + "lea 0x8000(%1, %0), %0\n" + "sar $16, %0\n" + : "=&r"(result), "+r"(wide_a) + : "r"(wide_b) + : "cc" ); + + return (FT_Int32)result; +#endif + +#pragma GCC diagnostic pop + } + +#endif /* __GNUC__ && __x86_64__ */ + #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */ diff --git a/include/freetype/config/ftconfig.h b/include/freetype/config/ftconfig.h index 3349e298d..ab1e7a504 100644 --- a/include/freetype/config/ftconfig.h +++ b/include/freetype/config/ftconfig.h @@ -338,6 +338,7 @@ FT_BEGIN_HEADER /* These must be defined `static __inline__' with GCC. 
*/ #if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */ + #define FT_MULFIX_ASSEMBLER FT_MulFix_arm /* documentation is in freetype.h */ @@ -370,6 +371,7 @@ FT_BEGIN_HEADER #if defined( __arm__ ) && \ ( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \ !( defined( __CC_ARM ) || defined( __ARMCC__ ) ) + #define FT_MULFIX_ASSEMBLER FT_MulFix_arm /* documentation is in freetype.h */ @@ -399,7 +401,9 @@ FT_BEGIN_HEADER /* ( __thumb2__ || !__thumb__ ) && */ /* !( __CC_ARM || __ARMCC__ ) */ + #if defined( __i386__ ) + #define FT_MULFIX_ASSEMBLER FT_MulFix_i386 /* documentation is in freetype.h */ @@ -468,6 +472,62 @@ FT_BEGIN_HEADER #endif /* _MSC_VER */ + +#if defined( __GNUC__ ) && defined( __x86_64__ ) + +#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64 + + static __inline__ FT_Int32 + FT_MulFix_x86_64( FT_Int32 a, + FT_Int32 b ) + { + /* Temporarily disable the warning that C90 doesn't support */ + /* `long long'. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wlong-long" + +#if 1 + /* Technically not an assembly fragment, but GCC does a really good */ + /* job at inlining it and generating good machine code for it. */ + long long ret, tmp; + + + ret = (long long)a * b; + tmp = ret >> 63; + ret += 0x8000 + tmp; + + return (FT_Int32)( ret >> 16 ); +#else + + /* This hand-written variant is compiled out by the `#if 1' above. */ + /* `imul' both reads and writes `wide_a', so it must be a read-write */ + /* operand (`+r'). With a write-only constraint (`=&r') GCC may drop */ + /* the sign extension of `a' and treat the register that holds `a' on */ + /* input as a 64-bit value, which is wrong most of the time; this was */ + /* observed with GCC 4.6 on Ubuntu 12.04. 
*/ + long long wide_a = (long long)a; + long long wide_b = (long long)b; + long long result; + + + __asm__ __volatile__ ( + "imul %2, %1\n" + "mov %1, %0\n" + "sar $63, %0\n" + "lea 0x8000(%1, %0), %0\n" + "sar $16, %0\n" + : "=&r"(result), "+r"(wide_a) + : "r"(wide_b) + : "cc" ); + + return (FT_Int32)result; +#endif + +#pragma GCC diagnostic pop + } + +#endif /* __GNUC__ && __x86_64__ */ + #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */