Need faster Exponentiate

Richard · Post by **Richard** » May 27, 2019 6:06

Here is code that plots the error functions for FastPow() and for Raise().

'=======================================================================
'an approach which applies the (usually quite fast) runtime-Sqr-function (multiple times)
'the optional i_0_11 allows to adjust the amount of iterations - and thus the precision...
'with its default: i_0_11=6  the error is +-1% ... with 7 it is half that, a.s.o. up to 11
'-----------------------------------------------------------------------
' Written by OSchmidt, modified by Richard
'   WARNING: errors rise rapidly as x approaches zero
'-----------------------------------------------------------------------
Function FastPow( Byval x As Single, Byval n As Single) As Single
    Static As Single L( 0 To 11 ) = { 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625,_
    7.8125e-3, 3.90625e-3, 1.953125e-3, 9.765625e-4, 4.8828125e-4, 2.44140625e-4 }
    
    If x = 0 Then Return 0  ' special case for 0-values of x
    
    Dim As Long i, i_0_11 = 6   ' this parameter has been internalised for testing
    Dim As Single r = 1
    
    If n > 0 Then   ' handle positive exponents
        
        While n >= 1
            r *= x
            n -= 1
        Wend
        
        For i = 0 To i_0_11
            x = Sqr(x)
            If n >= L(i) Then r *= x: n -= L(i)
            If n <= L(i_0_11) Then Exit For ' early exit (n can't get any smaller)
        Next
        
    Else
        If n = 0 Then Return 1  ' special case, avoids n=0 crash
        r = 1 / FastPow( x, -n ) ' handle negative exponents
    End If
    
    Return r
End Function

'-----------------------------------------------------------------------
' Compare the single precision  x^b  with algorithm under test
'  reference  ref time   test     test time   speed ratio  Error bounds
'  single_pwr   69.8ns   FastPow11  154.7ns      221.67%    0.03125%
'  single_pwr   69.8ns   FastPow10  141.5ns      202.68%    0.0625%
'  single_pwr   69.7ns   FastPow9   127.0ns      182.23%    0.125%
'  single_pwr   69.7ns   FastPow8   112.3ns      161.07%    0.25%
'  single_pwr   69.7ns   FastPow7    99.8ns      143.11%    0.5%
'  single_pwr   69.7ns   FastPow6    87.6ns      125.68%    1.%
'  single_pwr   69.7ns   FastPow5    75.5ns      108.31%    2.%
'  single_pwr   69.8ns  single_pwr   69.8ns      100.00%    0.00001%   XXXXXXXX
'  single_pwr   69.7ns   FastPow4    63.3ns       90.81%    4.%
'  single_pwr   69.8ns   FastPow3    52.0ns       74.42%    8.%
'  single_pwr   69.8ns    Raise      41.7ns       59.72%    0.50%  XXXXXXXXXXXX
'  single_pwr   69.8ns   FastPow2    41.4ns       59.36%   16.%
'  single_pwr   69.9ns   FastPow1    32.5ns       46.52%   32.%

'=======================================================================
' Approximate the power function; written by Richard
'-----------------------------------------------------------------------
' Approximate x ^ b, for x > 0; |b| < 60. Avoid infinite or denormalised x values
Function Raise( Byval x As Single, Byval b As Single ) As Single
    ' to raise x to power b, y = x^b, y = Exp2( b * Log2( x ) )
    ' IEEE 754 Single   Seeeeeeeefffffffffffffffffffffff ' Sign_Log_Linear format
    #define expo_bias &b00111111000000000000000000000000uL ' bias = 126 * 2^23
    #define frac_mask &b00000000011111111111111111111111uL ' positive fraction
    #define frac_bits 23 ' bits used to store fraction, ignoring implicit msb
    #define bias 126 ' exponent that puts single into range from 0.500 to 0.999
    Dim As Long Ptr fp = Cptr( Long Ptr, @x ) ' pointer to bit pattern of x
    '-------- approximate Log2(x) --------------------------------------
    Dim As Long expo = ( *fp Shr frac_bits ) - bias  ' extract exponent
    *fp = ( *fp And frac_mask ) Or expo_bias  ' bias x into range 0.5 to 0.999
    #Define a0 -3.5353058e0     ' log2 polynomial approximation
    #Define a1  8.2616852e0     '   over the range from 0.5 to 1.0
    #Define a2 -8.7235965e0     '   maximum error is 1.54e-4 = 0.000154
    #Define a3  5.3685806e0     ' Zeros are located at
    #Define a4 -1.3713634e0     '   0.500, 0.575, 0.720, 0.885, 1.000
    x = b * ( Csng( expo ) + a0 + ( a1 + ( a2 + ( a3 + a4 * x ) * x ) * x  ) * x )
    '-------- approximate 2^x, is antiLog2( x ) ------------------------
    expo = Int( x ) ' this integer part adjusts the exponent of Alog2(x)
    x -= expo   ' x reduced to fraction in range 0.000 thru 0.999
    x = 1e0 + ( 0.6607687e0 + 0.3392313e0 * x ) * x  ' approx 2^x over range 0 to 1
    *fp += ( expo Shl frac_bits ) ' restore early integer to biased log exponent
    Return x
End Function

'-----------------------------------------------------------------------
'    Maximum error is < ± 0.30% for b < ± 2.5
'    Maximum error is < ± 0.35% for b < ± 7.0
'    Maximum error is < ± 0.50% for b < ± 22.
'    Maximum error is < ± 0.92% for b < ± 60.
'    Beyond which the exponent overflows, so it all falls apart
'
'=======================================================================
' set up graphics screen
Dim As Integer sx, sy, sz
Screeninfo sx, sy, sz
sx -= 32 ' shrink
sy -= 24
sz = 4   ' 16 colours
Screenres sx, sy, sz

'--------------------------------------
' graphics scale
Dim As Single x, y, z, t, q 
Dim As Single lhs = 0.0, rhs = 4.2, lo = -0.5, hi = 0.5
Window ( lhs, lo ) - ( rhs, hi )

Line( lhs,  0 )-( rhs,  0 ), 1
For x = 1 To 4
    Line( x, lo )-( x, hi ), 1
Next x

'=======================================================================
' plot the error curves
Dim As Single max = -1e6, min = 1e6, scale = 40
Dim As Integer i, k = 7

Print
Print " Powers =  ";
'--------------------------------------
For q = -4.5 To 4.51 Step 0.1
    k += 1                  ' 16 colours
    If k > 15 Then k = 8
    Color k
    Print Using "####.##"; q;
    '----------------------------------
    For x = 0 To rhs Step ( rhs - lhs ) / ( sx * 2 )
        y = x^q             ' correct
        '--------------------------------------
        z = FastPow( x, q ) ' Raise( x, q ) '  ' approximation function to plot
        '--------------------------------------
        t = ( z - y ) / y   ' t is the error
        If min > t Then min = t
        If max < t Then max = t
        Pset( x, scale * t ), k
    Next x
Next q

Line( lhs,  0.005 * scale )-( rhs,  0.005 * scale ), 1
Line( lhs, -0.005 * scale )-( rhs, -0.005 * scale ), 1
Draw String ( 0.01, 0.0049 * scale ), " +0.5 %", 15
Draw String ( 0.01, -.0052 * scale ), " -0.5 %", 15
Line( lhs,  max * scale )-( rhs,  max * scale ), 2
Line( lhs,  min * scale )-( rhs,  min * scale ), 2

'=======================================================================
For x = 0 To 4
    Draw String ( x+.01, -0.005), Str(x), 15
Next x
Color 15
Print
Print
Print Using " Error bounds ####.### % ####.### %"; max * 100; min * 100

'=======================================================================
Sleep
'=======================================================================

Richard · Post by **Richard** » May 30, 2019 5:00

Here is a faster version of Raise() that replaces the high order Log2 polynomial with a Log2 table.

This code takes about 35% of the time needed by single x^b. For fastest execution run it with compiler option -sse, without -exx

You can change the internal value of n beyond 11 to make a bigger Log2 table with lower errors. There is no speed penalty for changing n, just the static table storage requirement. Since the minimum error is fixed at about 0.5% by the quadratic antilog approximation, you only need to increase n if you are using higher values of b, say greater than 4.

Code: Select all

'=======================================================================
Function table_pwr( Byval x As Single, Byval b As Single ) As Single
    #define n 11     ' number of bits used to address LU table
    #define max_addr ( 2^n - 1 ) ' also used as table index address mask
    Static As Single table( 0 To max_addr )  ' Log2 fractional mantissa table
    Static As Short i = 0
    If i = 0 Then           ' initialise the table on first call
        For i = 0 To 2^n - 1
            table( i ) = Log( 1 + i / 2^n ) / Log( 2 )  ' table of Log2
        Next i
    End If  ' i is now non-zero so will not initialise again
    ' IEEE 754 Single   Seeeeeeeefffffffffffffffffffffff ' Sign_Log_Linear format
    #define frac_bits 23 ' bits used to store fraction, ignoring implicit msb
    #define bias 127 ' the exponent that makes a single range from 1.0 to 1.999
    Dim As Long Ptr fp = Cptr( Long Ptr, @x )   ' pointer to bit pattern of x
    Dim As Single expo = ( ( *fp Shr frac_bits ) - bias ) ' integer part of log2
    expo += table( ( *fp Shr ( frac_bits - n ) ) And max_addr ) ' add mantissa
    x = expo * b
    '-------- approximate 2^x, is antiLog2( x ) ------------------------
    expo = Int( x ) ' this integer part adjusts the exponent of Alog2(x)
    x -= expo   ' x reduced to fraction in range 0.000 thru 0.999
    x = 1e0 + ( 0.6607687e0 + 0.3392313e0 * x ) * x  ' approx 2^x over range 0 to 1
    *fp += ( expo Shl frac_bits ) ' restore early integer to biased log exponent
    Return x
End Function

Provoni · Post by **Provoni** » May 30, 2019 6:41

It is indeed faster and better and I will be using it instead. Thanks!

Using the following compiler flags: -gen gcc -Wc -march=native,-Ofast,-funroll-loops,-fomit-frame-pointer,-ftree-loop-im,-fivopts,-pipe

Need faster Exponentiate

Re: Need faster Exponentiate

Re: Need faster Exponentiate

Re: Need faster Exponentiate