# FFFF (Fast Floating Fractal Fun) MIPS4 ISA (64 bit) dual pixel R8010 FPU calculation.
 # This code uses 2x unroll reaching 25% theoretical-peak of R8010 (75 MFLOPS @75MHz).
 # Alas, the dual FPUs should really deliver 50%, which is the R8010 real-world peak (when not using MADDs only).
 # It seems we cannot hide the *huge* 4-cycle ADD/SUB latencies in the short Mandelbrot loop.
 # A future quad-pixel version with 4x unroll should reach 50%.
 # This FPU is fun but the lame 4 cycles latency for ADD/SUB is a showstopper.
 # Author: Daniele Paccaloni


 # Here we tell the assembler not to touch the code in any way:
 #   noreorder - instructions are emitted exactly in the order written, so every
 #               branch delay slot in this file is filled by hand.
 #   noat      - the assembler must not use $1 ($at) on its own; the code in this
 #               file uses $1 explicitly as a base register.
 #   nomacro   - each source line must be a single machine instruction.
	.set	noreorder
	.set	noat
	.set	nomacro

 # Section declarations for code (.text) and 8-byte literals (.lit8).
 # NOTE(review): the numeric operands look like section type, flags, entry size
 # and alignment in the SGI assembler's .section syntax - confirm against the
 # assembler manual before changing them.
	.section .text, 1, 0x00000006, 4, 16
.text:

 # The .lit8 label below marks the start of the literal section; the routine
 # addresses its 4.0 constant relative to it via %gp_rel(.lit8-30720).
	.section .lit8, 1, 0x30000002, 8, 8
.lit8:
	.section .text

	# Program Unit: calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
 #
 # bool calcPixelRow_FPU_ASM_MIPSR8000(unsigned int* rowBuffer, unsigned int maxi,
 #                                     unsigned int iter_black, int width,
 #                                     double cx, double cy, double sx)
 #
 # Computes the Mandelbrot iteration count for one row of pixels, two adjacent
 # pixels ("0" and "1" in the comments) per pass of the inner loop.
 #   - Pixel k of the row uses c = (cx + k*sx, cy) and starts from z = c.
 #   - Each pass does z = z*z + c and counts the pass unless 4.0 < |z|^2 was seen;
 #     once a pixel has escaped its counter is frozen (its z keeps being updated
 #     while the partner pixel is still running, but that result is never used).
 #   - A pixel pair is finished when both pixels have escaped or when either
 #     counter is no longer below maxi.
 #   - Each result is stored as a 32-bit word; a counter that is not below maxi
 #     is replaced by iter_black.
 #   - width is rounded down to an even number; width <= 0 (after rounding)
 #     stores nothing.
 #   - Always returns 1 (true) in $2.
 #
 # [Daniele] We are using MIPS4 ISA (64 bits code).
 # C code passes params in regs:
 #   $4.d = rowBuffer
 #   $5.w = maxi
 #   $6.w = iter_black
 #   $7.w = width
 # $f16.d = cx
 # $f17.d = cy
 # $f18.d = sx
 # We can safely use GPRs $2..$15 and FPRs $f0..$f23.
 #        Return value must be in $2 (bool).
 #
 # Registers written: $1, $2, $3, $4, $7..$12, $f4, $f6..$f16, $f19, $f20, $fcc0, $fcc1.
 # No stack is used.
 #
 # Register roles inside the loops:
 #   $2/$3    = i      (iteration counters, pixel 0/1)
 #   $8/$9    = a      (1 while the pixel has not escaped, 0 afterwards)
 #   $f6/$f7  = zx     $f8/$f9   = zy
 #   $f10/$f11 = zx2   $f12/$f13 = zy2   $f14/$f15 = m2 = zx2 + zy2
 #   $f16/$f19 = cx of pixel 0/1, $f17 = cy, $f20 = 2*sx, $f4 = 4.0
 #
 # NOTE: the i < maxi tests use slt, i.e. maxi is compared as a signed value.
	.ent	calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
	.globl	calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25:	# 0x0
	.dynsym	calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25	sto_default
	.frame	$sp, 0, $31           # leaf routine, don't bother the stack frame.

.BB1.calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25:	# 0x0
 # Build the gp-relative base in $1 from the entry address in $25, so that the
 # literal pool can be reached without modifying $gp itself.
	lui $8,%hi(%neg(%gp_rel(calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25)))	#  calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
	addiu $8,$8,%lo(%neg(%gp_rel(calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25)))	#  calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
	daddu $1,$25,$8               	# $1 = base for %gp_rel() accesses

	ldc1 $f4, %gp_rel(.lit8-30720)($1) # $f4 = 4.0 (escape threshold for |z|^2)
	addiu $8, $0, -2		# $8 = ~1 ($8 is free again after the gp computation)
	and $7, $7, $8			# Force width to be even without truncating its upper bits
	add.d $f19, $f16, $f18          # 1: cx = cx + sx
	blez $7, .RowDone		# Nothing to store if no full pixel pair is left
	add.d $f20, $f18, $f18          # 0,1: 2*sx (delay slot)

	.align 4
.NxtPix:
	or $2, $0, $0                   # 0: i = 0
	or $3, $0, $0                   # 1: i = 0
	mov.d $f6, $f16                 # 0: zx = cx
	mov.d $f7, $f19                 # 1: zx = cx (+sx)
	ori $8, $0, +1			# 0: a = 1
	ori $9, $0, +1			# 1: a = 1
	mov.d $f8, $f17                 # 0: zy = cy
	mov.d $f9, $f17                 # 1: zy = cy
.NxtI:
	mul.d $f10, $f6, $f6            # 0: zx2 = zx*zx
	mul.d $f11, $f7, $f7            # 1: zx2 = zx*zx
	slt $11, $2, $5                 # 0: i < maxi ?
	slt $12, $3, $5                 # 1: i < maxi ?

	mul.d $f12, $f8, $f8            # 0: zy2 = zy*zy
	mul.d $f13, $f9, $f9            # 1: zy2 = zy*zy
	and $11, $11, $12               # 0,1: i[0&1] < maxi ?
	beq $11, $0, .DonePix           # 0,1: If (i[0|1] >= maxi) done...
	add.d $f6, $f6, $f6             # 0: 2*zx (delay slot; harmless when the branch is taken)
	add.d $f7, $f7, $f7             # 1: 2*zx

	add.d $f14, $f10, $f12          # 0: m2 = (zx2 + zy2)
	add.d $f15, $f11, $f13          # 1: m2 = (zx2 + zy2)

	c.lt.d $fcc0, $f4, $f14         # 0: 4.0 < m2 ? (must wait 1 cycle for result)
	c.lt.d $fcc1, $f4, $f15         # 1: 4.0 < m2 ? (must wait 1 cycle for result)

 	madd.d $f8, $f17, $f8, $f6      # 0: zy = 2*zx*zy + cy (MADD seems slower on R12000 !!)
 	madd.d $f9, $f17, $f9, $f7      # 1: zy = 2*zx*zy + cy (MADD seems slower on R12000 !!)

	sub.d $f6, $f10, $f12           # 0: zx = zx2-zy2
	sub.d $f7, $f11, $f13           # 1: zx = zx2-zy2

	add.d $f6, $f6, $f16            # 0: zx += cx
	add.d $f7, $f7, $f19            # 1: zx += cx
	movt $8, $0, $fcc0              # 0: 4.0 < m2 ? if true, stop inc
	movt $9, $0, $fcc1              # 1: 4.0 < m2 ? if true, stop inc

	or $10, $9, $8			# 0,1: Non-zero while at least one pixel has not escaped.
	daddu $2, $2, $8                # 0: i += a

	bne $10, $0, .NxtI		# 0,1: If some pixel has not escaped yet, continue...
	daddu $3, $3, $9                # 1: i += a (delay slot)

 # 0,1: END of loop (both pixels escaped). Falling through here implies
 # $8 == $9 == 0, so neither counter was incremented on the last pass and
 # no compensation of i is needed.

.DonePix:
	slt $8, $2, $5                  # 0: i < max_iters ?
	slt $9, $3, $5                  # 1: i < max_iters ?
	add.d $f16, $f16, $f20          # 0: cx = cx + 2*sx
	add.d $f19, $f19, $f20          # 1: cx = cx + 2*sx
	movz $2, $6, $8                 # 0: If (i >= max_iters), i = iter_black
	movz $3, $6, $9                 # 1: If (i >= max_iters), i = iter_black
	addiu $7, $7, -2                # width -= 2
	sw $2, 0($4)                    # 0: [rowBuffer] = i
	sw $3, 4($4)                    # 1: [rowBuffer + 4] = i
	bne $7, $0, .NxtPix             # width is even and > 0 here, so it reaches exactly 0
	daddiu $4, $4, +8               # rowBuffer += 8 bytes (delay slot); 64-bit add, rowBuffer is a 64-bit pointer

.RowDone:
	ori $2, $0, 1                	# Return value (true)

	jr $31                        	# Return.
	nop                           	#  (delay slot)

	.end	calcPixelRow_FPU_ASM_MIPSR8000__GPUiUiT2idN25
	.section .text

	# 8-byte literal pool. Its first (and only) entry is the double constant
	# that the routine loads into $f4 with ldc1 at function entry.
	.section .lit8
	.origin 0x0
	.align	4
	.dword	0x4010000000000000	# double constant 4.0
	.section .text
	.align 4
	.section .lit8
	.align 3
	# NOTE(review): 30720 matches the bias subtracted in %gp_rel(.lit8-30720)
	# in the routine; presumably the two must be kept in sync - confirm against
	# the assembler documentation for .gpvalue.
	.gpvalue 30720