# FFFF (Fast Floating Fractal Fun) MIPS4 ISA (64 bit) dual pixel R8010 FPU calculation.
# This code uses 2x unroll reaching 25% theoretical-peak of R8010 (75 MFLOPS @75MHz).
# Alas, the dual FPUs should really deliver 50% which is the R8010 real-world peak (when not using MADDs only).
# Seems we cannot hide the *huge* 4 cycles ADD/SUB latencies in the short mandel loop.
# A future quad-pixel version with 4x unroll should reach 50%.
# This FPU is fun but the lame 4 cycles latency for ADD/SUB is a showstopper.
# Author: Daniele Paccaloni

# Here we tell the assembler not to touch the code in any way.
# NOTE: with "noreorder" every branch/jump below has an explicit delay slot;
#       the instruction right after a branch is ALWAYS executed.
        .set    noreorder
        .set    noat
        .set    nomacro

        .section .text, 1, 0x00000006, 4, 16
.text:
        .section .lit8, 1, 0x30000002, 8, 8
.lit8:
        .section .text

#-----------------------------------------------------------------------
# bool calcPixelRow_FPU_ASM_MIPSR8000(unsigned int* rowBuffer,
#                                     unsigned int  maxi,
#                                     unsigned int  iter_black,
#                                     int           width,
#                                     double cx, double cy, double sx)
#
# Computes one row of Mandelbrot iteration counts, two pixels per pass.
#
# In:    $4   = rowBuffer  (output, one 32-bit word per pixel)
#        $5   = maxi       (iteration limit, compared as signed)
#        $6   = iter_black (value stored for pixels that reach maxi)
#        $7   = width      (pixel count; rounded DOWN to an even number,
#                           so the last pixel of an odd width is not written;
#                           width <= 0 after rounding writes nothing)
#        $f16 = cx (real part of the first pixel)
#        $f17 = cy (imaginary part, shared by the whole row)
#        $f18 = sx (cx step between adjacent pixels)
#        $25  = address of this routine (used for the gp-relative literal)
# Out:   $2   = 1 (always true)
# Clobb: $1, $2, $3, $4, $7..$12, $f4, $f6..$f16, $f19, $f20, $fcc0, $fcc1
# Stack: none (leaf routine).
# NOTE(review): the register budget below ($f0..$f23 scratch) matches the
#        64-bit ABI; confirm the build uses it.
#-----------------------------------------------------------------------
        # Program Unit: calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
        .ent    calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
        .globl  calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25:  # 0x0
        .dynsym calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25   sto_default
        .frame  $sp, 0, $31     # leaf routine, don't bother the stack frame.
        #.mask  0x80000000, -80
        # Leftover annotations; this routine never touches the stack:
        # zx = 8
        # zy = 16
        # zx2 = 32
        # zy2 = 40
        # x = 0
        # i = 24
        # lcl_spill_temp_0 = 48
        # bool calcPixelRow_FPU_ASM_MIPSR8000(unsigned int* rowBuffer, unsigned int maxi, unsigned int iter_black, int width, double cx, double cy, double sx) {
.BB1.calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25:     # 0x0
        lui     $8,%hi(%neg(%gp_rel(calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25)))        # calcPixelRow_FPU_ASM_MIPS__GPUiUiT2idN25
        addiu   $8,$8,%lo(%neg(%gp_rel(calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25)))     # calcPixelRow_FPU_ASM_MIPS__GPUiUiT2idN25
        daddu   $1,$25,$8               # $1 = base register for the gp-relative load below

        # [Daniele] We are using MIPS4 ISA (64 bits code).
        # C code passes params in regs:
        #   $4.d   = rowBuffer
        #   $5.w   = maxi
        #   $6.w   = iter_black
        #   $7.w   = width
        #   $f16.d = cx
        #   $f17.d = cy
        #   $f18.d = sx
        # We can safely use GPRs $2..$15 and FPRs $f0..$f23.
        # Return value must be in $2 (bool).
        #
        # Register roles ("0:" = even pixel, "1:" = odd pixel):
        #   $2/$3     i      iteration counters
        #   $8/$9     a      1 while the pixel is still iterating, 0 once |z|^2 > 4
        #   $f6/$f7   zx     $f8/$f9   zy
        #   $f10/$f11 zx2    $f12/$f13 zy2    $f14/$f15 m2 = zx2 + zy2
        #   $f16/$f19 cx of pixel 0 / pixel 1, $f17 cy, $f20 = 2*sx, $f4 = 4.0

        ldc1    $f4, %gp_rel(.lit8-30720)($1)   # 4.0
        # Force width to be even by clearing bit 0 only. (An "andi 0xfffe"
        # would also drop bits 16..31 and could leave a zero count that makes
        # the "bne width, 0" loop below run away past the end of rowBuffer.)
        srl     $7, $7, 1
        sll     $7, $7, 1               # width &= ~1 (sign preserved)
        add.d   $f19, $f16, $f18        # 1: cx = cx + sx
        blez    $7, .RowDone            # width <= 0: nothing to compute
        add.d   $f20, $f18, $f18        # 0,1: 2*sx                     (delay slot)

        .align  4
.NxtPix:
        or      $2, $0, $0              # 0: i = 0
        or      $3, $0, $0              # 1: i = 0
        mov.d   $f6, $f16               # 0: zx = cx
        mov.d   $f7, $f19               # 1: zx = cx (+sx)
        ori     $8, $0, +1              # 0: a = 1
        ori     $9, $0, +1              # 1: a = 1
        mov.d   $f8, $f17               # 0: zy = cy
        mov.d   $f9, $f17               # 1: zy = cy

.NxtI:
        mul.d   $f10, $f6, $f6          # 0: zx2 = zx*zx
        mul.d   $f11, $f7, $f7          # 1: zx2 = zx*zx
        slt     $11, $2, $5             # 0: i < maxi ?
        slt     $12, $3, $5             # 1: i < maxi ?
        mul.d   $f12, $f8, $f8          # 0: zy2 = zy*zy
        mul.d   $f13, $f9, $f9          # 1: zy2 = zy*zy
        and     $11, $11, $12           # 0,1: i[0&1] < maxi ?
        beq     $11, $0, .DonePix       # 0,1: If (i[0|1] >= maxi) done...
        add.d   $f6, $f6, $f6           # 0: 2*zx                       (delay slot; harmless when taken,
                                        #    zx is reloaded at .NxtPix)
        add.d   $f7, $f7, $f7           # 1: 2*zx
        add.d   $f14, $f10, $f12        # 0: m2 = (zx2 + zy2)
        add.d   $f15, $f11, $f13        # 1: m2 = (zx2 + zy2)
        c.lt.d  $fcc0, $f4, $f14        # 0: 4.0 < m2 ? (must wait 1 cycle for result)
        c.lt.d  $fcc1, $f4, $f15        # 1: 4.0 < m2 ? (must wait 1 cycle for result)
        madd.d  $f8, $f17, $f8, $f6     # 0: zy = 2*zx*zy + cy (MADD seems slower on R12000 !!)
        madd.d  $f9, $f17, $f9, $f7     # 1: zy = 2*zx*zy + cy (MADD seems slower on R12000 !!)
        sub.d   $f6, $f10, $f12         # 0: zx = zx2-zy2
        sub.d   $f7, $f11, $f13         # 1: zx = zx2-zy2
        add.d   $f6, $f6, $f16          # 0: zx += cx
        add.d   $f7, $f7, $f19          # 1: zx += cx
        movt    $8, $0, $fcc0           # 0: 4.0 < m2 ? if true, stop inc
        movt    $9, $0, $fcc1           # 1: 4.0 < m2 ? if true, stop inc
        or      $10, $9, $8             # 0,1: Check if both pixels diverge.
        daddu   $2, $2, $8              # 0: i++ (adds 0 once the pixel has diverged)
        bne     $10, $0, .NxtI          # 0,1: If some pixel still converging, continue...
        daddu   $3, $3, $9              # 1: i++                        (delay slot)

        # 0,1: END of loop (both diverging).
        # Here $8 == $9 == 0, so neither counter was bumped on the last pass
        # and no compensation of i is needed.

.DonePix:
        slt     $8, $2, $5              # 0: i < max_iters ?
        slt     $9, $3, $5              # 1: i < max_iters ?
        add.d   $f16, $f16, $f20        # 0: cx = cx + 2*sx
        add.d   $f19, $f19, $f20        # 1: cx = cx + 2*sx
        movz    $2, $6, $8              # 0: If (i >= max_iters), i = iter_black
        movz    $3, $6, $9              # 1: If (i >= max_iters), i = iter_black
        addiu   $7, $7, -2              # width -= 2
        sw      $2, 0($4)               # 0: [rowBuffer] = i
        sw      $3, 4($4)               # 1: [rowBuffer] = i
        bne     $7, $0, .NxtPix         # if (width > 0) calculate next pixel...
        daddiu  $4, $4, +8              # rowBuffer += 8                (delay slot)
                                        # 64-bit add: rowBuffer is a 64-bit pointer.

.RowDone:
        ori     $2, $0, 1               # Return value (true)
        jr      $31                     # Return.
        nop                             #                               (delay slot)
        .end    calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25

        .section .text
        .section .lit8
        .origin 0x0
        .align  4
        .dword  0x4010000000000000      # double constant 4.0

        .section .text
        .align  4
        .section .lit8
        .align  3
        .gpvalue 30720