Snowfall v0.65 build 2018-12-28

UEZ
Posts: 296
Joined: May 05, 2017 19:59
Location: Germany

Snowfall v0.65 build 2018-12-28

Here is my current version of Snowfall. Maybe I will also add some wind effects using a flow-field calculation instead of simplex noise... ¯\_(ツ)_/¯

Of course, there is a lot of room for improvement, e.g. using pre-calculated flake images instead of drawing circles.

Code: Select all

```freebasic
'Snowfall v0.65 build 2018-12-28
'Coded by UEZ
'Using classes (my 1st attempt^^)

#Include "fbgfx.bi"
#Include "string.bi"

Using FB

Declare Function _ASM_ImageBlur(pImage As Any Ptr, iRadius As Integer, iExpandEdge As Integer = 0) As Any Ptr 'Function by Eukalyptus

'Desktop size (iDW x iDH) and the window size derived from it (scrw x scrh)
Dim Shared As Integer iDW, iDH, scrw, scrh
Screencontrol GET_DESKTOP_SIZE, iDW, iDH
scrw = iDW * 0.95
scrh = iDH * 0.85

'Default number of snowflakes
Const iSnowflakes = 10000

' Simplex noise in 2D
' from paper http://webstaff.itn.liu.se/~stegu/simplexnoise/simplexnoise.pdf
' https://www.freebasic.net/forum/viewtopic.php?t=20526#p180192
Type float As Single 'Double

'Permutation table: the same 256 values stored twice (512 entries, indices 0..511).
'SimplexNoise2D masks its cell coordinates to 0..255, so the largest index it
'builds is 256 + 255 = 511.
Dim Shared As Integer perm(511) = { _
   151,160,137, 91, 90, 15,131, 13,201, 95, 96, 53,194,233,  7,225, _
   140, 36,103, 30, 69,142,  8, 99, 37,240, 21, 10, 23,190,  6,148, _
   247,120,234, 75,  0, 26,197, 62, 94,252,219,203,117, 35, 11, 32, _
    57,177, 33, 88,237,149, 56, 87,174, 20,125,136,171,168, 68,175, _
    74,165, 71,134,139, 48, 27,166, 77,146,158,231, 83,111,229,122, _
    60,211,133,230,220,105, 92, 41, 55, 46,245, 40,244,102,143, 54, _
    65, 25, 63,161,  1,216, 80, 73,209, 76,132,187,208, 89, 18,169, _
   200,196,135,130,116,188,159, 86,164,100,109,198,173,186,  3, 64, _
    52,217,226,250,124,123,  5,202, 38,147,118,126,255, 82, 85,212, _
   207,206, 59,227, 47, 16, 58, 17,182,189, 28, 42,223,183,170,213, _
   119,248,152,  2, 44,154,163, 70,221,153,101,155,167, 43,172,  9, _
   129, 22, 39,253, 19, 98,108,110, 79,113,224,232,178,185,112,104, _
   218,246, 97,228,251, 34,242,193,238,210,144, 12,191,179,162,241, _
    81, 51,145,235,249, 14,239,107, 49,192,214, 31,181,199,106,157, _
   184, 84,204,176,115,121, 50, 45,127,  4,150,254,138,236,205, 93, _
   222,114, 67, 29, 24, 72,243,141,128,195, 78, 66,215, 61,156,180, _
   151,160,137, 91, 90, 15,131, 13,201, 95, 96, 53,194,233,  7,225, _
   140, 36,103, 30, 69,142,  8, 99, 37,240, 21, 10, 23,190,  6,148, _
   247,120,234, 75,  0, 26,197, 62, 94,252,219,203,117, 35, 11, 32, _
    57,177, 33, 88,237,149, 56, 87,174, 20,125,136,171,168, 68,175, _
    74,165, 71,134,139, 48, 27,166, 77,146,158,231, 83,111,229,122, _
    60,211,133,230,220,105, 92, 41, 55, 46,245, 40,244,102,143, 54, _
    65, 25, 63,161,  1,216, 80, 73,209, 76,132,187,208, 89, 18,169, _
   200,196,135,130,116,188,159, 86,164,100,109,198,173,186,  3, 64, _
    52,217,226,250,124,123,  5,202, 38,147,118,126,255, 82, 85,212, _
   207,206, 59,227, 47, 16, 58, 17,182,189, 28, 42,223,183,170,213, _
   119,248,152,  2, 44,154,163, 70,221,153,101,155,167, 43,172,  9, _
   129, 22, 39,253, 19, 98,108,110, 79,113,224,232,178,185,112,104, _
   218,246, 97,228,251, 34,242,193,238,210,144, 12,191,179,162,241, _
    81, 51,145,235,249, 14,239,107, 49,192,214, 31,181,199,106,157, _
   184, 84,204,176,115,121, 50, 45,127,  4,150,254,138,236,205, 93, _
   222,114, 67, 29, 24, 72,243,141,128,195, 78, 66,215, 61,156,180}

'2D simplex noise, by D.J.Peters aka Joshy.
'  xin, yin : sample position
'  scale    : factor the summed corner contributions are multiplied with
'Returns scale * (sum of the three simplex corner contributions).
Function SimplexNoise2D(xin As float, yin As float, scale As float = 20.0) As float
   Const As float F2 = 0.5*(Sqr(3.0)-1.0)  'skew factor
   Const As float G2 = (3.0-Sqr(3.0))/6.0  'unskew factor
   Const As float G22 = G2 + G2
   Static As Integer grad2(11,1) = {{ 1, 1},{-1, 1},{1,-1},{-1,-1}, _
                                    { 1, 0},{-1, 0},{1, 0},{-1, 0}, _
                                    { 0, 1},{ 0,-1},{0, 1},{ 0,-1}}

   'Skew the input to find the cell (i, j) that contains the point
   Dim As float s = (xin+yin)*F2
   Dim As Integer i = Int(xin+s)
   Dim As Integer j = Int(yin+s)

   'Unskew the cell origin and get the offset (x0, y0) from it
   Dim As float t = (i+j)*G2
   Dim As float x  = i-t  , y = j-t
   Dim As float x0 = xin-x, y0 = yin-y

   'Wrap the cell coordinates into the range of the permutation table
   i And=255
   j And=255

   'Offsets of the middle corner: lower triangle (1,0) or upper triangle (0,1)
   Dim As Integer i1=Any, j1=Any
   If (x0>y0) Then
      i1=1: j1=0
   Else
      i1=0: j1=1
   End If

   'Offsets from the middle and the last corner
   Dim As float x1 = x0 - i1 + G2
   Dim As float y1 = y0 - j1 + G2
   Dim As float x2 = x0 - 1.0 + G22
   Dim As float y2 = y0 - 1.0 + G22

   Dim As Integer ind = Any
   Dim As float n=Any

   'Contribution of the first corner (also initialises n)
   t = 0.5 - x0*x0-y0*y0
   If (t<0) Then
      n=0
   Else
      ind = perm(i+perm(j)) Mod 12
      n = t*t*t*t  * (grad2(ind,0)*x0 + grad2(ind,1)*y0)
   End If

   'Contribution of the middle corner
   t = 0.5 - x1*x1-y1*y1
   If (t<0) Then
   Else
      ind = perm(i+i1+perm(j+j1)) Mod 12
      n+= t*t*t*t  * (grad2(ind,0)*x1 + grad2(ind,1)*y1)
   End If

   'Contribution of the last corner
   t = 0.5 - x2*x2-y2*y2
   If (t<0) Then
   Else
      i+=1:j+=1
      ind= perm(i+perm(j)) Mod 12
      n+= t*t*t*t  * (grad2(ind,0)*x2 + grad2(ind,1)*y2)
   End If

   'Original note: "scaled in the interval [-1,1]" - NOTE(review): not verified for the default scale of 20
   Return scale * n
End Function

'Returns Rnd() scaled and shifted into the range starting at fStart with width (fEnd - fStart).
Function RandomRange(fStart As Single, fEnd As Single) As Single
   Return Rnd() * (fEnd - fStart) + fStart
End Function

'A single snowflake: position, fall speed, accumulated wind offsets, size and opacity.
Type Snowflake
   Public:
      Declare Constructor()
      Declare Destructor()
      Declare Sub Init()
      Declare Sub Reset()
      Declare Sub update()
      As Ushort w, h                                         'area the flake lives in
      As Single x, y, vx, vy, wvx, wvy, radius, Alpha
End Type

'Places the flake at a random position anywhere inside the area.
Sub Snowflake.init()
   This.radius = RandomRange(1, 3)
   This.x = Rnd() * (This.w - This.radius)
   This.y = Rnd() * (This.h - This.radius)
   This.vx = 0
   This.vy = 2 * This.radius 'RandomRange(1, 4)
   This.Alpha = RandomRange(0.25, 0.95)
End Sub

'Re-spawns the flake at a random x position just above the top edge.
Sub Snowflake.Reset()
   This.radius = RandomRange(1, 3)
   This.x = Rnd() * (This.w - This.radius)
   This.y = Rnd() * -This.radius
   This.vx = 0
   This.vy = 2 * This.radius
   This.Alpha = RandomRange(0.25, 0.95)
End Sub

'Advances the flake by one frame: accumulates noise based turbulence,
'moves the flake and re-spawns it once it has left the area.
Sub Snowflake.Update()
   This.wvx += SimplexNoise2D(This.x * This.x, 2 * This.y) + SimplexNoise2D(This.y, This.x) 'turbulence x
   This.wvy += 1.05 * SimplexNoise2D(-This.x, -This.y + This.radius) - SimplexNoise2D(2 * This.y, This.x + This.y + This.radius) 'turbulence y
   'Drop the accumulated turbulence as soon as it leaves [-3, 3]
   If This.wvx > 3 Or This.wvx < -3 Then This.wvx = 0
   If This.wvy > 3 Or This.wvy < -3 Then This.wvy = 0
   This.x += This.wvx
   This.y += This.vy + This.wvy / 2
   If (This.y > This.h + This.radius) Or (This.x < -This.radius) Or (This.x > This.w) Then This.Reset()
End Sub

Constructor Snowflake()
   This.w = scrw
   This.h = scrh
   This.Init()
End Constructor

Destructor Snowflake()
End Destructor

'Owns all snowflakes plus the off-screen images they are rendered into.
Type Snowflakes
      Declare Constructor(n As Ushort = iSnowflakes)
      Declare Destructor()
      Declare Sub Draw()
   Private:
      As Ushort w, h, amount
      As Snowflake Ptr pBuffer
      As Image Ptr Img_Empty, Img_Snowfall, Img_Blur
End Type

'Renders one frame: clears the work image, draws and updates every flake,
'blurs the result and puts it on the screen.
Sub Snowflakes.Draw()
   Put This.Img_Snowfall, (0, 0), This.Img_Empty, Pset
   'Integer counter on purpose: for amount = 0 the upper bound is -1, which an
   'unsigned 16-bit counter cannot represent.
   For i As Integer = 0 To This.amount - 1
      Circle This.Img_Snowfall, (pBuffer[i].x, pBuffer[i].y), pBuffer[i].radius, Rgba(255, 255, 255, 255 * pBuffer[i].Alpha),,,,F
      pBuffer[i].update
   Next
   This.Img_Blur = _ASM_ImageBlur(This.Img_Snowfall, 2)
   If This.Img_Blur <> 0 Then
      Put (0, 0), This.Img_Blur, Trans
      Imagedestroy This.Img_Blur
      This.Img_Blur = 0
   Else
      '_ASM_ImageBlur returns 0 on failure: show the unblurred frame instead
      Put (0, 0), This.Img_Snowfall, Trans
   End If
End Sub

Constructor Snowflakes(n As Ushort)
   With This
      .amount = n
      .w = scrw
      .h = scrh
   End With
   Img_Empty = Imagecreate(This.w, This.h, &hFF010512, 32) 'background image
   Img_Snowfall = Imagecreate(This.w, This.h, , 32)         'work image, refilled every frame
   pBuffer = New Snowflake[amount]
End Constructor

Destructor Snowflakes()
   Delete[] pBuffer
   pBuffer = 0
   Imagedestroy This.Img_Empty
   Imagedestroy This.Img_Snowfall
End Destructor

Screenres scrw, scrh, 32, 1, GFX_ALPHA_PRIMITIVES Or GFX_NO_SWITCH Or GFX_ALWAYS_ON_TOP

#Ifdef __FB_WIN32__
   #Include "windows.bi"
   'Position the window using the desktop work area
   Dim tWorkingArea As RECT
   SystemParametersInfo(SPI_GETWORKAREA, null, @tWorkingArea, null)
   Screencontrol SET_WINDOW_POS, (iDW - scrw) \ 2, ((tWorkingArea.Bottom - scrh) - (iDH - tWorkingArea.Bottom)) \ 2
#Endif

Windowtitle "Simple Snowfall with " & Format(iSnowflakes, "###,###") & " snowflakes @ " & scrw & "x" & scrh & ". Coded by UEZ"

Dim As Snowflakes Snowfall
Dim As Ulong iFPS = 0, iFPS_current = 0
Dim As Double fTimer = Timer
Dim As String sKey

Do
   Screenlock
   Snowfall.Draw
   Draw String(0, 0), iFPS_current & " fps", Rgb(&hFF, &h00, &h00)
   Screenunlock

   'Count every rendered frame, including the one that ends the measuring interval
   iFPS += 1
   If Timer - fTimer > 0.99 Then
      iFPS_current = iFPS
      iFPS = 0
      fTimer = Timer
   End If

   Sleep 1
   sKey = Inkey
   'Leave on Esc or when the window close button is used (Inkey reports Chr(255) & "k")
Loop Until (sKey = Chr(27)) Orelse (sKey = (Chr(255) & "k"))

'Blurs a 32 bit image and returns the result as a NEW image.
'By Eukalyptus / modified by D.J. Peters aka Joshy
'  pImage      : source image (must use 4 bytes per pixel)
'  iRadius     : blur radius, limited to 0..127 and to what the image size allows
'  iExpandEdge : <> 0 -> the result is enlarged by iRadius pixels on every side
'Returns the blurred image (to be released with Imagedestroy by the caller) or 0
'if the source is not a valid 32 bit image, is smaller than 3 pixels in one
'direction, or the work images could not be created.
'
'The filter runs in two passes. The first pass filters every row of the copy and
'writes the result transposed into a temporary image; the second pass filters the
'rows of that temporary image and writes them transposed back. The window weights
'rise and fall linearly (SumDiv is built from increasing increments), which looks
'like a stack blur - NOTE(review): term taken from the "Stack" comments, confirm.
Function _ASM_ImageBlur(pImage As Any Ptr, iRadius As Integer, iExpandEdge As Integer = 0) As Any Ptr
   Dim As Integer iWidth, iHeight, iPX, iPitch, iPitchBlur
   Dim As Any Ptr pData, pDataBlur, pDataTmp

   If Imageinfo(pImage, iWidth, iHeight, iPX, iPitch, pData) <> 0 Then Return 0
   If iPX <> 4 Then Return 0

   If iRadius < 0 Then
      iRadius = 0
   Elseif iRadius > 127 Then
      iRadius = 127
   Endif

   'Every pass handles (iRadius + 1) leading, (size - 2 * (iRadius + 1)) middle and
   '(iRadius + 1) trailing pixels, and each loop body runs at least once. The
   'middle count therefore has to be at least 1, otherwise the loops run past the
   'end of the pixel buffers.
   Dim As Integer iMinSide = Iif(iWidth < iHeight, iWidth, iHeight)
   If iMinSide < 3 Then Return 0
   If iExpandEdge = 0 Andalso iRadius > (iMinSide - 3) \ 2 Then iRadius = (iMinSide - 3) \ 2

   Dim As Any Ptr pImgBlur, pImgTmp
   If iExpandEdge <> 0 Then
      iWidth += iRadius * 2
      iHeight += iRadius * 2
   Endif

   pImgBlur = Imagecreate(iWidth, iHeight, 0, 32)
   pImgTmp = Imagecreate(iWidth, iHeight, 0, 32)

   'Check the allocations before the images are used for anything
   If pImgBlur = 0 Orelse pImgTmp = 0 Then
      If pImgBlur <> 0 Then Imagedestroy(pImgBlur)
      If pImgTmp <> 0 Then Imagedestroy(pImgTmp)
      Return 0
   End If

   Imageinfo(pImgBlur, , , , iPitchBlur, pDataBlur)
   Imageinfo(pImgTmp, , , , , pDataTmp)

   'Copy the source into the result image; the blur then works on that copy
   If iExpandEdge <> 0 Then
      Put pImgBlur, (iRadius, iRadius), pImage, Alpha
   Else
      Put pImgBlur, (0, 0), pImage, Alpha
   End If

   'Register names and sizes for 32 bit and 64 bit builds
   #Ifndef __FB_64BIT__
      #Define REG_SIZE 4
      #Define REG_ACCESS DWORD
      #Define REG_AX eax
      #Define REG_BX ebx
      #Define REG_CX ecx
      #Define REG_DX edx
      #Define REG_DI edi
      #Define REG_SI esi
      #Define REG_SP esp
      #Define REG_BP ebp
   #Else
      #Define REG_SIZE 8
      #Define REG_ACCESS QWORD
      #Define REG_AX rax
      #Define REG_BX rbx
      #Define REG_CX rcx
      #Define REG_DX rdx
      #Define REG_DI rdi
      #Define REG_SI rsi
      #Define REG_SP rsp
      #Define REG_BP rbp
   #Endif

   #Define LOCAL_VAR_SPACE 16*REG_SIZE

   'Scratch slots on the stack:
   'esp/rsp = [X] [Y] [W] [H] [Stride] [R] [pDst] [pSrc] [pDstO] [pSrcO]
   #Define X_OFF    [REG_SP]
   #Define Y_OFF    [REG_SP+1*REG_SIZE]
   #Define W_OFF    [REG_SP+2*REG_SIZE]
   #Define H_OFF    [REG_SP+3*REG_SIZE]
   #Define S_OFF    [REG_SP+4*REG_SIZE]
   #Define R_OFF    [REG_SP+5*REG_SIZE]
   #Define DST_OFF  [REG_SP+6*REG_SIZE]
   #Define SRC_OFF  [REG_SP+7*REG_SIZE]
   #Define DSTO_OFF [REG_SP+8*REG_SIZE]
   #Define SRCO_OFF [REG_SP+9*REG_SIZE]

   Asm
      'Load all BASIC variables into registers first; afterwards the base and
      'stack pointer registers are changed
      mov REG_CX, [iWidth]
      mov REG_BX, [iHeight]
      mov REG_DX, [iPitchBlur]
      mov REG_DI, [pDataTmp]
      mov REG_SI, [pDataBlur]
      mov REG_AX, [iRadius]
      inc REG_AX 'R = iRadius + 1

      push REG_BP
      mov REG_BP, REG_AX
      sub REG_SP, LOCAL_VAR_SPACE

      mov W_OFF,    REG_CX
      mov H_OFF,    REG_BX
      mov S_OFF,    REG_DX
      mov R_OFF,    REG_BP
      mov DST_OFF,  REG_DI
      mov DSTO_OFF, REG_DI
      mov SRC_OFF,  REG_SI
      mov SRCO_OFF, REG_SI

      mov REG_AX, 0x47000000 'ByteToFloat MSK
      movd xmm7, REG_AX
      pshufd xmm7, xmm7, 0

      ' ####################################################
      ' # W-Loop
      ' ####################################################
      mov REG_BX, H_OFF
      mov Y_OFF, REG_BX 'row counter

   _Blur_LoopW:
      mov REG_DI, DST_OFF
      mov REG_SI, SRC_OFF
      mov REG_DX, S_OFF 'Stride
      add REG_ACCESS Ptr DST_OFF, 4 'Next RowCol(Transform vertical<->horizontal)
      add SRC_OFF, REG_DX 'Next Row
      mov REG_DX, H_OFF 'Y-Stride
      shl REG_DX, 2
      pxor xmm6, xmm6 'Reset In-Out
      pxor xmm5, xmm5 'Reset Sum
      pxor xmm4, xmm4 'UnPack
      mov REG_AX, 0 'Reset SumDiv
      mov REG_BX, 0 'Reset DivInc

      ' ----------------------------------------------------
      ' | X-In += Next
      ' ----------------------------------------------------
      mov REG_BP, 0 'Offset
      mov REG_CX, R_OFF 'iR
   _Blur_LoopX_In:
      movd      xmm0, [REG_SI+REG_BP]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      paddw     xmm6, xmm0 'IN+=Next
      movdqa    xmm0, xmm6
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_BX, 1 'SumDivInc += 1
      add       REG_AX, REG_BX 'SumDiv += Inc
      add       REG_BP, 4
      sub       REG_CX, 1
      jg _Blur_LoopX_In

      ' ----------------------------------------------------
      ' | XIn += Next / XIn -= Mid / XOut += Mid
      ' ----------------------------------------------------
      mov REG_CX, R_OFF 'iR
   _Blur_LoopX_InOut:
      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI+REG_BP]
      movd      xmm1, [REG_SI]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      movlhps   xmm0, xmm1 '[Am][Rm][Gm][Bm][An][Rn][Gn][Bn] = [Mid][Next]
      paddw     xmm6, xmm0 'Out+=Mid / IN+=Next
      psubw     xmm6, xmm1 '(Out-=Last) / IN-=Mid
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      sub       REG_BX, 1 'SumDivInc -= 1
      add       REG_AX, REG_BX 'SumDiv += Inc
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopX_InOut

      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      mov       REG_BX, REG_BP
      neg       REG_BX 'Last Index

      ' ----------------------------------------------------
      ' | XIn += Next / XIn -= Mid / XOut += Mid / XOut -= Last
      ' ----------------------------------------------------
      mov REG_CX, W_OFF 'iWidth
      sub REG_CX, R_OFF
      sub REG_CX, R_OFF
   _Blur_LoopX:
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI+REG_BP]
      movd      xmm1, [REG_SI]
      movd      xmm2, [REG_SI+REG_BX]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      punpcklbw xmm2, xmm4 '[ ][ ][ ][ ][Al][Rl][Gl][Bl] Last
      movlhps   xmm0, xmm1 '[Am][Rm][Gm][Bm][An][Rn][Gn][Bn] = [Mid][Next]
      movlhps   xmm1, xmm2 '[Al][Rl][Gl][Bl][Ao][Ro][Go][Bo] = [Last][Mid]
      paddw     xmm6, xmm0 'Out+=Mid / IN+=Next
      psubw     xmm6, xmm1 'Out-=Last / IN-=Mid
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopX

      ' ----------------------------------------------------
      ' | XIn -= Mid / XOut += Mid / XOut -= Last
      ' ----------------------------------------------------
      mov REG_BP, 0 'DivInc
      mov REG_CX, R_OFF 'iR
   _Blur_LoopX_Out:
      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI]
      movd      xmm1, [REG_SI+REG_BX]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Al][Rl][Gl][Bl] Last
      movlhps   xmm0, xmm1 '[Al][Rl][Gl][Bl][Am][Rm][Gm][Bm] = [Last][Mid]
      psubw     xmm6, xmm0 'Out-=Last / IN-=Mid
      pslldq    xmm0, 8
      paddw     xmm6, xmm0 'Out+=Mid / (IN+=Next)
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_BP, 1
      sub       REG_AX, REG_BP
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopX_Out

      sub REG_ACCESS Ptr Y_OFF, 1
      jg _Blur_LoopW

      ' ####################################################
      ' # H-Loop
      ' ####################################################
      'Swap source and destination: read the temporary image, write the result image
      mov REG_DI, SRCO_OFF
      mov REG_SI, DSTO_OFF
      mov DST_OFF, REG_DI
      mov SRC_OFF, REG_SI
      mov REG_BX, W_OFF
      mov X_OFF, REG_BX 'column counter

   _Blur_LoopH:
      mov REG_DI, DST_OFF
      mov REG_SI, SRC_OFF
      mov REG_DX, H_OFF
      shl REG_DX, 2
      add REG_ACCESS Ptr DST_OFF, 4 'Next Col
      add SRC_OFF, REG_DX 'Next ColRow
      mov REG_DX, S_OFF 'Stride
      pxor xmm6, xmm6 'Reset In-Out
      pxor xmm5, xmm5 'Reset Sum
      pxor xmm4, xmm4 'UnPack
      mov REG_AX, 0 'Reset SumDiv
      mov REG_BX, 0 'Reset DivInc

      ' ----------------------------------------------------
      ' | X-In += Next
      ' ----------------------------------------------------
      mov REG_BP, 0 'Offset
      mov REG_CX, R_OFF 'iR
   _Blur_LoopY_In:
      movd      xmm0, [REG_SI+REG_BP]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      paddw     xmm6, xmm0 'IN+=Next
      movdqa    xmm0, xmm6
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_BX, 1 'SumDivInc += 1
      add       REG_AX, REG_BX 'SumDiv += Inc
      add       REG_BP, 4
      sub       REG_CX, 1
      jg _Blur_LoopY_In

      ' ----------------------------------------------------
      ' | XIn += Next / XIn -= Mid / XOut += Mid
      ' ----------------------------------------------------
      mov REG_CX, R_OFF 'iR
   _Blur_LoopY_InOut:
      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI+REG_BP]
      movd      xmm1, [REG_SI]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      movlhps   xmm0, xmm1 '[Am][Rm][Gm][Bm][An][Rn][Gn][Bn] = [Mid][Next]
      paddw     xmm6, xmm0 'Out+=Mid / IN+=Next
      psubw     xmm6, xmm1 '(Out-=Last) / IN-=Mid
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      sub       REG_BX, 1 'SumDivInc -= 1
      add       REG_AX, REG_BX 'SumDiv += Inc
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopY_InOut

      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      mov       REG_BX, REG_BP
      neg       REG_BX 'Last Index

      ' ----------------------------------------------------
      ' | XIn += Next / XIn -= Mid / XOut += Mid / XOut -= Last
      ' ----------------------------------------------------
      mov REG_CX, H_OFF 'iHeight
      sub REG_CX, R_OFF
      sub REG_CX, R_OFF
   _Blur_LoopY:
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI+REG_BP]
      movd      xmm1, [REG_SI]
      movd      xmm2, [REG_SI+REG_BX]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][An][Rn][Gn][Bn] Next
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      punpcklbw xmm2, xmm4 '[ ][ ][ ][ ][Al][Rl][Gl][Bl] Last
      movlhps   xmm0, xmm1 '[Am][Rm][Gm][Bm][An][Rn][Gn][Bn] = [Mid][Next]
      movlhps   xmm1, xmm2 '[Al][Rl][Gl][Bl][Ao][Ro][Go][Bo] = [Last][Mid]
      paddw     xmm6, xmm0 'Out+=Mid / IN+=Next
      psubw     xmm6, xmm1 'Out-=Last / IN-=Mid
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopY

      ' ----------------------------------------------------
      ' | XIn -= Mid / XOut += Mid / XOut -= Last
      ' ----------------------------------------------------
      mov REG_BP, 0 'DivInc
      mov REG_CX, R_OFF 'iR
   _Blur_LoopY_Out:
      cvtsi2ss  xmm3, REG_AX
      rcpss     xmm3, xmm3
      pshufd    xmm3, xmm3, 0 'SumDiv
      movdqa    xmm0, xmm5
      paddd     xmm0, xmm7 ' Ubyte -> Float
      subps     xmm0, xmm7 ' /
      mulps     xmm0, xmm3
      addps     xmm0, xmm7 ' Float -> Ubyte
      psubd     xmm0, xmm7 ' /
      packssdw  xmm0, xmm0 '[A][R][G][B][A][R][G][B]
      packuswb  xmm0, xmm0 '[ARGB][ARGB][ARGB][ARGB]
      movd      [REG_DI], xmm0
      movd      xmm0, [REG_SI]
      movd      xmm1, [REG_SI+REG_BX]
      punpcklbw xmm0, xmm4 '[ ][ ][ ][ ][Am][Rm][Gm][Bm] Mid
      punpcklbw xmm1, xmm4 '[ ][ ][ ][ ][Al][Rl][Gl][Bl] Last
      movlhps   xmm0, xmm1 '[Al][Rl][Gl][Bl][Am][Rm][Gm][Bm] = [Last][Mid]
      psubw     xmm6, xmm0 'Out-=Last / IN-=Mid
      pslldq    xmm0, 8
      paddw     xmm6, xmm0 'Out+=Mid / (IN+=Next)
      movdqa    xmm1, xmm6
      movdqa    xmm0, xmm6
      punpckhwd xmm1, xmm4 '[AO][RO][GO][BO]
      punpcklwd xmm0, xmm4 '[AI][RI][GI][BI]
      psubd     xmm5, xmm1 'Stack -= Out
      paddd     xmm5, xmm0 'Stack += IN
      add       REG_BP, 1
      sub       REG_AX, REG_BP
      add       REG_SI, 4
      add       REG_DI, REG_DX
      sub       REG_CX, 1
      jg _Blur_LoopY_Out

      sub REG_ACCESS Ptr X_OFF, 1
      jg _Blur_LoopH

      'Release the scratch slots and restore the base pointer
      add REG_SP, LOCAL_VAR_SPACE
      pop REG_BP
   End Asm

   Imagedestroy(pImgTmp)
   Return pImgBlur
End Function
```

It runs at ~22 fps @ 1520x765 on my "old" notebook.